Beispiel #1
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    from jcvi.formats.bed import uniq

    parser = OptionParser(gaps.__doc__)
    opts, args = parser.parse_args(args)

    # Exactly two positional arguments are required; otherwise show help.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    ombed, fastafile = args  # fastafile is accepted but not used below
    bed = Bed(uniq([ombed]))

    def plus_range(rec):
        # Strand is fixed to "+" for every range handed to range_distance.
        return (rec.seqid, rec.start, rec.end, "+")

    # Walk consecutive pairs of BED lines and compare the distance between
    # the features themselves with the distance between the ranges encoded
    # in their accn fields.
    for a, b in pairwise(bed):
        om_a = plus_range(a)
        om_b = plus_range(b)
        ch_a = plus_range(range_parse(a.accn))
        ch_b = plus_range(range_parse(b.accn))

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Nothing to report when neither distance is positive.
        no_gap = om_dist <= 0 and ch_dist <= 0
        if no_gap:
            continue

        print(a)
        print(b)
        print(om_dist, ch_dist)
Beispiel #2
0
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(chimera.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    selected = select_bed(bed)

    # One pass over the selected lines: remember which chromosomes every
    # scaffold hits, and keep each line with its parsed accn range so the
    # reporting loop below does not have to rescan and re-parse `selected`
    # for every multi-mapped scaffold.
    mapped = defaultdict(set)  # scaffold => chromosomes
    hits = defaultdict(list)  # scaffold => [(bed line, parsed accn range)]
    for b in selected:
        rr = range_parse(b.accn)
        mapped[rr.seqid].add(b.seqid)
        hits[rr.seqid].append((b, rr))

    chimerabed = "chimera.bed"
    nchimera = 0
    log = sys.stderr.write  # explicit writes work on both Python 2 and 3

    # The output file is created even when no junction is found; the `with`
    # block guarantees it is closed if anything below raises.
    with open(chimerabed, "w") as fw:
        for s, chrs in sorted(mapped.items()):
            if len(chrs) == 1:
                continue

            log("=" * 80 + "\n")
            log("{0} mapped to multiple locations: {1}".
                format(s, ",".join(sorted(chrs))) + "\n")
            ranges = []
            for b, rr in hits[s]:
                log(str(b) + "\n")
                ranges.append(rr)

            # Identify breakpoints. Every range collected here has seqid == s,
            # so ordering by (start, end) is enough and no per-pair seqid
            # check is needed.
            ranges.sort(key=lambda x: (x.start, x.end))
            for a, b in pairwise(ranges):
                # Junction spans from the end of one range to the start of
                # the next, normalised so that start <= end.
                start, end = a.end, b.start
                if start > end:
                    start, end = end, start

                chimeraline = "\t".join(str(x) for x in (s, start, end))
                fw.write(chimeraline + "\n")
                log(chimeraline + "\n")
                nchimera += 1

    logging.debug("A total of {0} junctions written to `{1}`.".
                  format(nchimera, chimerabed))
Beispiel #3
0
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    # --backbone stays on the command line for compatibility; its value is
    # not consulted anywhere in this function.
    p.add_option("--backbone",
                 default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object",
                 default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals: take the lines
    # of the other bed reported by `intersectBed -v` against the backbone,
    # then combine them with the backbone lines.
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining: consecutive lines
    # whose accn range shares a seqid are merged into one output line.
    # NOTE(review): groupby only merges *consecutive* lines; this presumably
    # relies on the order of `uniqbed` after print_to_file - confirm.
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # `with` closes the output even if merge_ranges or the write fails.
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(uniqbed, key=key):
            seqid, start, end, strand = merge_ranges(list(sb))
            fields = (seqid, start, end, opts.object, 1000, strand)
            bed_fw.write("\t".join(str(x) for x in fields) + "\n")
Beispiel #4
0
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    # --backbone is still accepted, but this function never reads its value.
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    # Backbone lines first, then the lines of the other bed that
    # `intersectBed -v` reported against the backbone.
    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    # NOTE(review): groupby only merges consecutive lines with an equal key;
    # confirm `uniqbed` is ordered accordingly at this point.
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # Context manager so the handle is released even when a merge fails.
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(uniqbed, key=key):
            seqid, start, end, strand = merge_ranges(list(sb))
            row = (seqid, start, end, opts.object, 1000, strand)
            # Explicit write instead of the Python 2 only `print >> fh`.
            bed_fw.write("\t".join(str(x) for x in row) + "\n")
Beispiel #5
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args  # fastafile is required but not used below
    ombed = uniq([ombed])
    bed = Bed(ombed)

    # Compare every pair of consecutive BED lines: once by the coordinates of
    # the lines themselves, once by the ranges encoded in their accn fields.
    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        # Only the distance is needed; the second return value is ignored.
        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        if om_dist <= 0 and ch_dist <= 0:
            continue

        # Single-argument print(...) calls give the same output whether print
        # is a statement (Python 2) or a function (Python 3); the bare
        # `print a` statements were a SyntaxError on Python 3.
        print(a)
        print(b)
        print("{0} {1}".format(om_dist, ch_dist))
Beispiel #6
0
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    parser = OptionParser(closest.__doc__)
    parser.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping

    def locus(feature):
        # With --om the region of interest is the range encoded in accn;
        # otherwise it is the BED line's own coordinates.
        if opts.om:
            return range_parse(feature.accn)
        return feature

    regions = [[r.seqid, r.start, r.end] for r in map(locus, Bed(candidates))]
    gap_ranges = [(g.seqid, g.start, g.end) for g in Bed(gapsbed)]

    for region in range_merge(regions):
        left_gap = range_closest(gap_ranges, region)
        right_gap = range_closest(gap_ranges, region, left=False)
        seqid = region[0]

        # A flanking gap counts only when it lies on the same sequence; with
        # no usable gap the bound falls back to position 1 or to the sequence
        # size respectively.
        if left_gap is None or left_gap[0] != seqid:
            lower = 1
        else:
            lower = left_gap[1]

        if right_gap is None or right_gap[0] != seqid:
            upper = sizes[seqid]
        else:
            upper = right_gap[2]

        print("\t".join(map(str, (seqid, lower - 1, upper))))
Beispiel #7
0
def merge_ranges(beds):
    """Collapse the ranges encoded in the ``accn`` fields of *beds* into one.

    Returns a ``(seqid, start, end, strand)`` tuple, where start and end are
    the smallest start and largest end among the parsed accn ranges, and
    strand is "-" only when minus-strand lines strictly outnumber the rest.
    Logs an error and returns None unless the parsed ranges share exactly one
    seqid.
    """
    parsed = [range_parse(b.accn) for b in beds]

    seqids = {r.seqid for r in parsed}
    if len(seqids) != 1:
        logging.error("Multiple seqid found in pocket. Aborted.")
        return None

    (seqid,) = seqids
    lo = min(r.start for r in parsed)
    hi = max(r.end for r in parsed)

    # Majority vote on the strand of the BED lines; ties resolve to "+".
    minus = sum(1 for b in beds if b.strand == "-")
    plus = len(beds) - minus
    if minus > plus:
        strand = "-"
    else:
        strand = "+"

    return seqid, lo, hi, strand
Beispiel #8
0
def merge_ranges(beds):
    """
    Merge the accn ranges of a group of BED lines into a single range.

    Each line's accn is parsed with range_parse. When all parsed ranges share
    one seqid, the result is (seqid, min start, max end, strand), with strand
    decided by a majority vote over the lines' strand fields ('+' on a tie).
    Otherwise an error is logged and None is returned.
    """
    ranges = [range_parse(accn) for accn in (x.accn for x in beds)]

    seqids = set(r.seqid for r in ranges)
    if len(seqids) != 1:
        logging.error("Multiple seqid found in pocket. Aborted.")
        return

    seqid = seqids.pop()
    starts = [r.start for r in ranges]
    ends = [r.end for r in ranges]

    n_neg = len([x for x in beds if x.strand == '-'])
    n_pos = len(beds) - n_neg
    # '+' unless minus-strand lines are strictly in the majority.
    strand = '+' if n_neg <= n_pos else '-'

    return seqid, min(starts), max(ends), strand
Beispiel #9
0
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(closest.__doc__)
    p.add_option("--om", default=False, action="store_true",
                 help="The bedfile is OM blocks [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping
    bed = Bed(candidates)

    # With --om the region is the range encoded in accn, otherwise the BED
    # line's own coordinates.
    ranges = []
    for b in bed:
        r = range_parse(b.accn) if opts.om else b
        ranges.append([r.seqid, r.start, r.end])

    gapsbed = Bed(gapsbed)
    granges = [(x.seqid, x.start, x.end) for x in gapsbed]

    ranges = range_merge(ranges)
    for r in ranges:
        a = range_closest(granges, r)
        b = range_closest(granges, r, left=False)
        seqid = r[0]

        # Discard a neighbouring gap that sits on a different sequence.
        if a is not None and a[0] != seqid:
            a = None
        if b is not None and b[0] != seqid:
            b = None

        # Without a flanking gap, fall back to position 1 on the left and to
        # the sequence size on the right.
        mmin = 1 if a is None else a[1]
        mmax = sizes[seqid] if b is None else b[2]

        # print(...) with one argument behaves the same as a statement
        # (Python 2) and as a function (Python 3); the bare statement form
        # was a SyntaxError on Python 3.
        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
Beispiel #10
0
def fasta(args):
    """
    %prog fasta bedfile scf.fasta pseudomolecules.fasta

    Use OM bed to scaffold and create pseudomolecules. bedfile can be generated
    by running jcvi.assembly.opticalmap bed --blockonly
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.agp import OO, build

    p = OptionParser(fasta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, scffasta, pmolfasta = args
    pf = bedfile.rsplit(".", 1)[0]
    bed = Bed(bedfile)
    selected = select_bed(bed)
    oo = OO()
    seen = set()
    sizes = Sizes(scffasta).mapping
    agpfile = pf + ".agp"

    # The context manager closes the AGP file even if adding a scaffold or
    # writing the AGP raises; it is closed before build() reads it back.
    with open(agpfile, "w") as agp:
        for b in selected:
            scf = range_parse(b.accn).seqid
            seqid = b.seqid
            cs = (seqid, scf)
            # Each (chromosome, scaffold) pair is added once; repeats are
            # only logged.
            if cs not in seen:
                oo.add(seqid, scf, sizes[scf], b.strand)
                seen.add(cs)
            else:
                logging.debug("Seen {0}, ignored.".format(cs))

        oo.write_AGP(agp, gaptype="contig")

    build([agpfile, scffasta, pmolfasta])
Beispiel #11
0
def fasta(args):
    """
    %prog fasta bedfile scf.fasta pseudomolecules.fasta

    Use OM bed to scaffold and create pseudomolecules. bedfile can be generated
    by running jcvi.assembly.opticalmap bed --blockonly
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.agp import OO, build

    p = OptionParser(fasta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, scffasta, pmolfasta = args
    pf = bedfile.rsplit(".", 1)[0]
    agpfile = pf + ".agp"

    bed = Bed(bedfile)
    selected = select_bed(bed)
    sizes = Sizes(scffasta).mapping

    oo = OO()
    seen = set()  # (chromosome, scaffold) pairs already handed to `oo`

    # `with` releases the handle on any failure and closes the file before
    # build() is given its path.
    with open(agpfile, "w") as agp:
        for b in selected:
            scf = range_parse(b.accn).seqid
            cs = (b.seqid, scf)
            if cs in seen:
                logging.debug("Seen {0}, ignored.".format(cs))
                continue

            oo.add(b.seqid, scf, sizes[scf], b.strand)
            seen.add(cs)

        oo.write_AGP(agp, gaptype="contig")

    build([agpfile, scffasta, pmolfasta])
Beispiel #12
0
def test_range_parse(input, expected):
    """Check that range_parse(input) compares equal to expected."""
    from jcvi.utils.range import range_parse

    parsed = range_parse(input)
    assert parsed == expected