Example #1
0
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                          format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print >> fw, a
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
Example #2
0
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                          format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print >> fw, a
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
Example #3
0
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length", default=200, type="int", help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps([scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    p = agp.graph

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    q = ef.graph

    logging.debug("Reference graph: {0}".format(p))
    logging.debug("Patch graph: {0}".format(q))

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object = a.object
        if name in deleted:
            print >>sys.stderr, "* Skip {0}, already embedded".format(name)
            continue

        seen.add(name)

        target_name, tag = get_target(p, name)
        path = q.get_path(name, target_name, tag=tag)
        path_size = sum([sizes[x.v] for x, t in path]) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            print >>sys.stderr, name, target_name, path, path_size, status
            continue

        backward = False
        for x, t in path:
            if x.v in seen:
                print >>sys.stderr, "* Does not allow backward" " patch on {0}".format(x.v)
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = q.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = q.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        print >>sys.stderr, name, target_name, path, path_size, status

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(q, path, object, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines, delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True, delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines, delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruite big singleton contigs
    CUTOFF = opts.min_length
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < CUTOFF:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
Example #4
0
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length", default=200, type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps([scaffolds, "--split",
                                   "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    p = agp.graph

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    q = ef.graph

    logging.debug("Reference graph: {0}".format(p))
    logging.debug("Patch graph: {0}".format(q))

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object = a.object
        if name in deleted:
            print >> sys.stderr, "* Skip {0}, already embedded".format(name)
            continue

        seen.add(name)

        target_name, tag = get_target(p, name)
        path = q.get_path(name, target_name, tag=tag)
        path_size = sum([sizes[x.v] for x, t in path]) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            print >> sys.stderr, name, target_name, path, path_size, status
            continue

        backward = False
        for x, t in path:
            if x.v in seen:
                print >> sys.stderr, "* Does not allow backward" \
                                     " patch on {0}".format(x.v)
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = q.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = q.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        print >> sys.stderr, name, target_name, path, path_size, status

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(q, path, object, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines, \
                                 delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True, \
                                 delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines, \
                                 delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruite big singleton contigs
    CUTOFF = opts.min_length
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < CUTOFF:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])