Esempio n. 1
0
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    p = OptionParser(tips.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    complements = dict()
    for object, beds in groupby(cbed, key=lambda x: x.seqid):
        beds = list(beds)
        complements[object] = beds

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping
    tbeds = []

    for object, beds in groupby(pbed, key=lambda x: x.accn):
        beds = list(beds)
        startbed, endbed = beds[0], beds[-1]
        start_id, end_id = startbed.seqid, endbed.seqid
        if startbed.start == 1:
            start_id = None
        if endbed.end == sizes[end_id]:
            end_id = None
        print(object, start_id, end_id, file=sys.stderr)
        if start_id:
            b = complements[start_id][0]
            b.accn = object
            tbeds.append(b)
        tbeds.append(
            BedLine(
                "\t".join(
                    str(x) for x in (object, 0, bbsizes[object], object, 1000, "+")
                )
            )
        )
        if end_id:
            b = complements[end_id][-1]
            b.accn = object
            tbeds.append(b)

    tbed = Bed()
    tbed.extend(tbeds)

    tbedfile = "tips.bed"
    tbed.print_to_file(tbedfile)
Esempio n. 2
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch",
                 default=False,
                 action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale",
                 type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale
        bedline = "\t".join(
            str(x) for x in (qseqid, qstart - 1, qend,
                             "{0}:{1}".format(get_number(sseqid), sstart)))
        bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
Esempio n. 3
0
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            newbed.append(b)
            continue

        i, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
Esempio n. 4
0
def mergebed(args):
    """
    %prog mergebed map1.bed map2.bed map3.bed ...

    Combine bed maps to bed format, adding the map name.
    """
    p = OptionParser(mergebed.__doc__)
    p.add_option("-w",
                 "--weightsfile",
                 default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    b = Bed()
    mapnames = set()
    for row in fp:
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            m = BedLine(row)
            m.accn = "{0}-{1}".format(mapname, m.accn)
            m.extra = ["{0}:{1}".format(m.seqid, m.start)]
            b.append(m)
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    b.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".\
                        format(len(b), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
Esempio n. 5
0
def mergebed(args):
    """
    %prog mergebed map1.bed map2.bed map3.bed ...

    Combine bed maps to bed format, adding the map name.
    """
    p = OptionParser(mergebed.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    b = Bed()
    mapnames = set()
    for row in fp:
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            m = BedLine(row)
            m.accn = "{0}-{1}".format(mapname, m.accn)
            m.extra = ["{0}:{1}".format(m.seqid, m.start)]
            b.append(m)
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    b.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".\
                        format(len(b), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
Esempio n. 6
0
File: vcf.py Progetto: zjwang6/jcvi
def location(args):
    """
    %prog location bedfile fastafile

    Given SNP locations, summarize the locations in the sequences. For example,
    find out if there are more 3`-SNPs than 5`-SNPs.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(location.__doc__)
    p.add_option(
        "--dist",
        default=100,
        type="int",
        help="Distance cutoff to call 5` and 3` [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    dist = opts.dist
    sizes = Sizes(fastafile).mapping
    fp = open(bedfile)
    fiveprime = threeprime = total = 0
    percentages = []
    for row in fp:
        b = BedLine(row)
        pos = b.start
        size = sizes[b.seqid]
        if pos < dist:
            fiveprime += 1
        if size - pos < dist:
            threeprime += 1
        total += 1
        percentages.append(100 * pos / size)

    m = "Five prime (within {0}bp of start codon): {1}\n".format(
        dist, fiveprime)
    m += "Three prime (within {0}bp of stop codon): {1}\n".format(
        dist, threeprime)
    m += "Total: {0}".format(total)
    print(m, file=sys.stderr)

    bins = 10
    title = "Locations within the gene [0=Five-prime, 100=Three-prime]"
    stem_leaf_plot(percentages, 0, 100, bins, title=title)
Esempio n. 7
0
def merge(args):
    """
    %prog merge map1 map2 map3 ...

    Convert csv maps to bed format.

    Each input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    scaffold_759,81336,1,9.7
    """
    p = OptionParser(merge.__doc__)
    p.add_option("-w",
                 "--weightsfile",
                 default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    b = Bed()
    mapnames = set()
    for row in fp:
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            m = CSVMapLine(row, mapname=mapname)
            if m.cm < 0:
                logging.error("Ignore marker with negative genetic distance")
                print >> sys.stderr, row.strip()
            else:
                b.append(BedLine(m.bedline))
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    b.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".\
                        format(len(b), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
Esempio n. 8
0
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
Esempio n. 9
0
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == '-':
                b.extra[1] = b.strand = ('-' if b.strand == '+' else '+')

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
Esempio n. 10
0
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    prefix, = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    bedpefile = prefix + ".bedpe"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((.1, .15), .8, .03, fc='k'))

    # basecoverage and matecoverage
    ax = fig.add_axes([.1, .45, .8, .45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    baseline, = ax.plot(x, y, 'g-')
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    mateline, = ax.plot(x, y, 'r-')
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = .8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (.1 + x * bpratio)
    ypos = .15 + .03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = .25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, .15), xend - xstart, .03, fc='w'))

    root.text(.5, .1, scf, color='b', ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(.5, .1, scf, color='b', ha="center")
    root.text(.5, .05, warn_msg, color='gray', ha="center")
    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
Esempio n. 11
0
def bed_to_bedpe(bedfile,
                 bedpefile,
                 pairsbedfile=None,
                 matesfile=None,
                 ca=False,
                 strand=False):
    """
    This converts the bedfile to bedpefile, assuming the reads are from CA.
    """
    fp = must_open(bedfile)
    fw = must_open(bedpefile, "w")
    if pairsbedfile:
        fwpairs = must_open(pairsbedfile, "w")

    clones = defaultdict(list)
    for row in fp:
        b = BedLine(row)
        name = b.accn
        clonename = clone_name(name, ca=ca)
        clones[clonename].append(b)

    if matesfile:
        fp = open(matesfile)
        libraryline = next(fp)
        # 'library bes     37896   126916'
        lib, name, smin, smax = libraryline.split()
        assert lib == "library"
        smin, smax = int(smin), int(smax)
        logging.debug("Happy mates for lib {0} fall between {1} - {2}".format(
            name, smin, smax))

    nbedpe = 0
    nspan = 0
    for clonename, blines in clones.items():
        nlines = len(blines)
        if nlines == 2:
            a, b = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = b.seqid, b.start, b.end
            outcols = [
                aseqid, astart - 1, aend, bseqid, bstart - 1, bend, clonename
            ]
            if strand:
                outcols.extend([0, a.strand, b.strand])
            print("\t".join(str(x) for x in outcols), file=fw)
            nbedpe += 1
        elif nlines == 1:
            (a, ) = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = 0, 0, 0
        else:  # More than two lines per pair
            pass

        if pairsbedfile:
            start = min(astart, bstart) if bstart > 0 else astart
            end = max(aend, bend) if bend > 0 else aend
            if aseqid != bseqid:
                continue

            span = end - start + 1
            if (not matesfile) or (smin <= span <= smax):
                print(
                    "\t".join(
                        str(x) for x in (aseqid, start - 1, end, clonename)),
                    file=fwpairs,
                )
                nspan += 1

    fw.close()
    logging.debug("A total of {0} bedpe written to `{1}`.".format(
        nbedpe, bedpefile))
    if pairsbedfile:
        fwpairs.close()
        logging.debug("A total of {0} spans written to `{1}`.".format(
            nspan, pairsbedfile))
Esempio n. 12
0
def link(args):
    """
    %prog link bedfile fastafile

    Construct contig links based on bed file. Use --prefix to limit the links
    between contigs that start with the same prefix_xxx.
    """
    p = OptionParser(link.__doc__)
    p.add_option("--insert",
                 type="int",
                 default=0,
                 help="Mean insert size [default: estimate from data]")
    p.add_option("--cutoff", type="int", default=0,
            help="Largest distance expected for linkage " + \
                 "[default: estimate from data]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]"
    )
    p.add_option(
        "--debug",
        dest="debug",
        default=False,
        action="store_true",
        help="Print verbose info when checking mates [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    debug = opts.debug
    cutoff = opts.cutoff

    sizes = Sizes(fastafile)

    cutoffopt = "--cutoff={0}".format(cutoff)
    mateorientationopt = '--mateorientation=+-'
    bedfile, (meandist, stdev, p0, p1, p2) = \
            pairs([bedfile, cutoffopt, mateorientationopt])

    maxcutoff = cutoff or p2
    insert = opts.insert or p0
    logging.debug("Mate hangs must be <= {0}, --cutoff to override".\
            format(maxcutoff))

    rs = lambda x: x.accn[:-1]

    fp = open(bedfile)
    linksfile = bedfile.rsplit(".", 1)[0] + ".links"
    fw = open(linksfile, "w")

    for a, b in pairwise(fp):
        """
        Criteria for valid contig edge
        1. for/rev do not mapping to the same scaffold (useful for linking)
        2. assuming innie (outie must be flipped first), order the contig pair
        3. calculate sequence hangs, valid hangs are smaller than insert size
        """
        a, b = BedLine(a), BedLine(b)

        if rs(a) != rs(b):
            continue
        pe = rs(a)

        # Intra-contig links
        if a.seqid == b.seqid:
            continue

        # Use --prefix to limit the links between seqids with the same prefix
        # For example, contigs of the same BAC, mth2-23j10_001, mth-23j10_002
        if opts.prefix:
            aprefix = a.seqid.split("_")[0]
            bprefix = b.seqid.split("_")[0]
            if aprefix != bprefix:
                continue

        cl = ContigLink(a, b, insert=insert, cutoff=maxcutoff)
        if cl.flip_innie(sizes, debug=debug):
            print >> fw, "\t".join((pe, str(cl)))
Esempio n. 13
0
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    import string

    p = OptionParser(rename.__doc__)
    p.add_option("-a",
                 dest="gene_increment",
                 default=10,
                 type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b",
                 dest="gap_increment",
                 default=1000,
                 type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0",
                 default=6,
                 type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option(
        "--spad0",
        default=4,
        type="int",
        help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix",
                 default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        fp = open(gapbed)
        for row in fp:
            genes.append(BedLine(row))

    genes.sort(key=genes.key)
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."
    fw = open(idsfile, "w")
    for chr, lines in groupby(genes, key=lambda x: x.seqid):
        lines = list(lines)
        pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
        isChr = chr[0].upper() == 'C'
        digits = "".join(x for x in chr if x in string.digits)
        gs = "g" if isChr else "s"
        pp = prefix + digits + gs
        idx = 0
        if isChr:
            idx += gap_increment

        for r in lines:
            isGap = r.strand not in ("+", "-")
            if isGap:
                idx += gap_increment
                continue
            else:
                idx += gene_increment
            accn = pp + "{0:0{1}d}".format(idx, pad0)
            oldaccn = r.accn
            print >> fw, "\t".join((oldaccn, accn))
            r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
Esempio n. 14
0
 def bedline(self):
     score = "1000" if self.score == '.' else self.score
     row = "\t".join((self.seqid, str(self.start - 1), str(self.end),
                      self.accn, score, self.strand))
     return BedLine(row)
Esempio n. 15
0
 def __init__(self, fh):
     line = fh.readline().strip()
     self.id = BedLine(line).accn if line else None
Esempio n. 16
0
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == "-":
                b.extra[1] = b.strand = "-" if b.strand == "+" else "+"

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
Esempio n. 17
0
    def allocate(self, info, chr, start_id, end_id, id_table):

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = chr_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        spots = end_id - start_id - 1
        available = [
            x for x in xrange(start_id + 1, end_id)
            if (current_chr, x) not in self.black
        ]

        message = "{0} need {1} ids, has {2} spots ({3} available)".\
                format(chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id, prefix=self.prefix, \
                pad0=self.pad0, uc=self.uc)
        end_gene = gene_name(current_chr,
                             end_id,
                             prefix=self.prefix,
                             pad0=self.pad0,
                             uc=self.uc)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        needed = lines
        stride = Stride(needed, available)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None:  # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else:  # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[-nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            name = gene_name(current_chr, rank, prefix=self.prefix, \
                    pad0=self.pad0, uc=self.uc)
            print >> sys.stderr, "\t".join((str(b), name))
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr
Esempio n. 18
0
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    fp = open(refinedbed)
    gap_to_scf = defaultdict(list)
    seen = set()
    for row in fp:
        atoms = row.split()
        unplaced = atoms[3]
        strand = atoms[5]
        gapid = atoms[9]
        if gapid not in seen:
            seen.add(gapid)
            gi, gb = gorder[gapid]
            gpbed.append(gb)
            gappositions[(gb.seqid, gb.start, gb.end)] = gapid
        gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    beds = []
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            beds.append(b)
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            beds.append(BedLine("\t".join(str(x) for x in \
                    (scf, 0, size, sid, 1000, strand))))

    finalbed = Bed()
    finalbed.extend(beds)
    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
Esempio n. 19
0
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1    GeneA2    GeneB1    GeneB2   +/-      score

    Optional additional columns:
    orderA1   orderA2   orderB1   orderB2  sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true", \
                help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                  "SizeA|SizeB|Size|Block"

    fws = open(simplefile, "w")
    if header:
        print >> fws, "\t".join(h.split("|"))

    atotalbase = btotalbase = 0
    for i, block in enumerate(blocks):

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)

        astarti, aendi = min(ia), max(ia)
        bstarti, bendi = min(ib), max(ib)
        astart, aend = min(a)[1].accn, max(a)[1].accn
        bstart, bend = min(b)[1].accn, max(b)[1].accn

        sizeA = len(set(ia))
        sizeB = len(set(ib))
        size = len(block)

        slope, intercept = np.polyfit(ia, ib, 1)
        orientation = "+" if slope >= 0 else '-'
        aspan = aendi - astarti + 1
        bspan = bendi - bstarti + 1
        score = int((aspan * bspan)**.5)
        score = str(score)
        block_id = pf + "-block-{0}".format(i)

        if coords:

            aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
            bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
            abase = aendbase - astartbase + 1
            bbase = bendbase - bstartbase + 1
            atotalbase += abase
            btotalbase += bbase

            # Write dual lines
            aargs = [
                block_id, aseqid, astartbase, aendbase, abase, astart, aend,
                aspan, "+"
            ]
            bargs = [
                block_id, bseqid, bstartbase, bendbase, bbase, bstart, bend,
                bspan, orientation
            ]

            if bed:
                bbed.append(BedLine("\t".join(str(x) for x in \
                           (bseqid, bstartbase - 1, bendbase,
                           "{}:{}-{}".format(aseqid, astartbase, aendbase),
                           size, orientation))))

            for args in (aargs, bargs):
                print >> fws, "\t".join(str(x) for x in args)
            continue

        args = [astart, aend, bstart, bend, score, orientation]
        if additional:
            args += [
                astarti, aendi, bstarti, bendi, sizeA, sizeB, size, block_id
            ]
        print >> fws, "\t".join(str(x) for x in args)

    fws.close()
    logging.debug("A total of {0} blocks written to `{1}`.".format(
        i + 1, simplefile))

    if coords:
        print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2))
        print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2))
        print >> sys.stderr, "Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / min(atotalbase, btotalbase))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
Esempio n. 20
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    key2 = lambda x: x.query[:-rclip] if rclip else key1
    data = BlastSlow(blastfile)

    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                    (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                    (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)