Example #1
0
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                        cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print >> sys.stderr, "Remove {0}".format(c)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
Example #2
0
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang",
                 default=GoodOverhang,
                 type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl,
                    a.component_span,
                    b.component_span,
                    cutoff,
                    qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print >> sys.stderr, "Remove {0}".format(c)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta