Esempio n. 1
0
File: hic.py Progetto: xuanblo/jcvi
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    results_dir, contigs_fasta = args
    out = must_open(opts.outfile, 'w')
    ordering_paths = natsorted(iglob(results_dir, "*.ordering"))
    sizes = Sizes(contigs_fasta).mapping
    all_contigs = set(sizes.keys())
    placed = set()

    # One AGP object per ordering file; the object is named after the
    # file's basename up to its first dot.
    for path in ordering_paths:
        ordering = ContigOrdering(path)
        placed.update(entry.contig_name for entry in ordering)
        object_name = op.basename(path).split('.')[0]
        ordering.write_agp(object_name, sizes, out)

    # Contigs that appear in no ordering file are written as their own
    # single-component objects with unknown ("?") orientation.
    unplaced = all_contigs - placed
    summary = 'Anchored: {}, Singletons: {}'.format(len(placed), len(unplaced))
    logging.debug(summary)

    for name in natsorted(unplaced):
        order_to_agp(name, [(name, "?")], sizes, out)
Esempio n. 2
0
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    results_dir, contigs_fasta = args
    out = must_open(opts.outfile, 'w')
    ordering_paths = natsorted(iglob(results_dir, "*.ordering"))
    sizes = Sizes(contigs_fasta).mapping
    all_contigs = set(sizes.keys())
    placed = set()

    # One AGP object per ordering file; the object is named after the
    # file's basename up to its first dot.
    for path in ordering_paths:
        ordering = ContigOrdering(path)
        placed.update(entry.contig_name for entry in ordering)
        object_name = op.basename(path).split('.')[0]
        ordering.write_agp(object_name, sizes, out)

    # Contigs that appear in no ordering file are written as their own
    # single-component objects with unknown ("?") orientation.
    unplaced = all_contigs - placed
    summary = 'Anchored: {}, Singletons: {}'.format(len(placed), len(unplaced))
    logging.debug(summary)

    for name in natsorted(unplaced):
        order_to_agp(name, [(name, "?")], sizes, out)
Esempio n. 3
0
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """Write an AGP entry for every sequence not used as a component.

    Sequence names come from ``Sizes(scaffolds).mapping``; any name that
    already occurs as a ``component_id`` in ``agpfile`` is skipped.  Each
    remaining name is emitted, in sorted order, as a one-component object
    with "?" orientation into ``unplaced_agp``.
    """
    placed_ids = {record.component_id for record in AGP(agpfile)}
    sizes = Sizes(scaffolds).mapping
    out = must_open(unplaced_agp, "w")

    leftovers = [name for name in sorted(sizes.keys())
                 if name not in placed_ids]
    for name in leftovers:
        order_to_agp(name, [(name, "?")], sizes, out)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
Esempio n. 4
0
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """Write an AGP entry for every sequence not used as a component.

    Sequence names come from ``Sizes(scaffolds).mapping``; any name that
    already occurs as a ``component_id`` in ``agpfile`` is skipped.  Each
    remaining name is emitted, in sorted order, as a one-component object
    with "?" orientation into ``unplaced_agp``.
    """
    placed_ids = {record.component_id for record in AGP(agpfile)}
    sizes = Sizes(scaffolds).mapping
    out = must_open(unplaced_agp, "w")

    leftovers = [name for name in sorted(sizes.keys())
                 if name not in placed_ids]
    for name in leftovers:
        order_to_agp(name, [(name, "?")], sizes, out)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
Esempio n. 5
0
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    # The first line of the delta file holds the reference and query FASTA
    # paths.  Read it inside a context manager so the handle is closed
    # right away instead of being left to the garbage collector.
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    # Restrict to the user-supplied reference ids, otherwise use them all.
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    # `filter` here is the project's delta-filtering command (it takes an
    # argv-style list), not the Python builtin; from here on `deltafile`
    # is whatever that command returns.
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        # One plot per reference, each renamed to `<ref>.pdf`.
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes,
                          deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
Esempio n. 6
0
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    # The first line of the delta file holds the reference and query FASTA
    # paths.  Read it inside a context manager so the handle is closed
    # right away instead of being left to the garbage collector.
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    # Restrict to the user-supplied reference ids, otherwise use them all.
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    # `filter` here is the project's delta-filtering command (it takes an
    # argv-style list), not the Python builtin; from here on `deltafile`
    # is whatever that command returns.
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        # One plot per reference, each renamed to `<ref>.pdf`.
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes,
                          deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
Esempio n. 7
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
            help="Percentage identity cutoff [default: %default]")
    # Help text previously said "identity" for the coverage option.
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
            help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    # Work on the query supermap of the BLAST file, building it on demand.
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    # Running totals over all queries.
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive span length.  The previous form,
            # abs(qstart - qstop + 1), came out 2 short whenever
            # qstart <= qstop.
            this_covered += abs(b.qstop - b.qstart) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage))

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Summary goes to stderr so it never mixes with --list output.
    # NOTE(review): the averages below divide by alignlen / total /
    # queries_combined and raise ZeroDivisionError on empty input.
    report = sys.stderr.write
    report("Identity: {0} mismatches, {1} gaps, {2} alignlen\n".
           format(mismatches, gaps, alignlen))
    total = len(sizes.keys())
    report("Total mapped: {0} ({1:.1f}% of {2})\n".
           format(mapped_count, mapped_count * 100. / total, total))
    report("Total valid {0}: {1} ({2:.1f}% of {3})\n".
           format(cutoff_message, valid_count, valid_count * 100. / total, total))
    report("Average id = {0:.2f}%\n".
           format(100 - (mismatches + gaps) * 100. / alignlen))

    queries_combined = sum(sizes[x] for x in queries)
    report("Coverage: {0} covered, {1} total\n".
           format(covered, queries_combined))
    report("Average coverage = {0:.2f}%\n".
           format(covered * 100. / queries_combined))

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            fw.write("{0}\n".format(query_id))
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Second pass: keep only the BLAST lines whose query passed both
    # cutoffs.  (An unused, never-closed `open(blastfile)` was removed.)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            fw.write("{0}\n".format(b))
Esempio n. 8
0
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """Paint the features of a BED file onto a row of chromosomes.

    Args:
        root: drawing surface; only ``root.text`` and ``root.add_patch``
            are used (presumably a matplotlib axes - confirm with caller).
        bedfile: path of the BED file; its name minus the last extension
            is used as the image-map id and file prefix.
        sizes: if truthy, passed to ``Sizes`` to obtain chromosome
            lengths; otherwise lengths are the largest feature end per
            seqid in the BED file.
        iopts: image options; ``w``, ``h`` and ``dpi`` are forwarded to
            ``write_ImageMapLine``.
        mergedist: a feature starting within this distance of the previous
            feature's end, and of the same class, is drawn from that end.
        winsize: window size used to chunk features into image-map areas.
        imagemap: if truthy, write ``<prefix>.map`` next to the BED file.
        mappingfile: optional tab-delimited file mapping accession to
            class; a third column, when present, supplies preset colors.
        gauge: if truthy, draw a ``Gauge`` to the right of the chromosomes.
        legend: if truthy, draw one colored legend box per class.
        empty: if truthy, label text for an extra unfilled legend box.
        title: optional title text drawn at the top of the figure.
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    if imagemap:
        imgmapfile = prefix + ".map"
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
                         if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        # Without a mapping file every accession is its own class.
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        for b, blines in groupby(bed, key=(lambda x: x.seqid)):
            blines = list(blines)
            maxlen = max(x.end for x in blines)
            chr_lens[b] = maxlen

    # Record centromere positions and rewrite each accession to its class
    # ("-" when the accession has no mapping).
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (chr, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(chr)), ha="center")
        if centromeres:
            yy = ystart - centromeres[chr] * ratio
            ChromosomeWithCentromere(root,
                                     xx,
                                     ystart,
                                     yy,
                                     ystart - clen * ratio,
                                     width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = 1
    # color the regions
    for chr in sorted(chr_lens.keys()):
        segment_size, excess = 0, 0
        bac_list = []
        prev_end, prev_klass = 0, None
        for b in bed.sub_bed(chr):
            clen = chr_lens[chr]
            idx = chr_idxs[chr]
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            # Close small gaps between consecutive same-class features.
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a
                #             result of iterating over `winsize` regions of
                #             `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    tlx, tly, brx, bry = (
                        xx,
                        (1 - ystart) + segment_start * ratio,
                        xx + xwidth,
                        (1 - ystart) + segment_end * ratio,
                    )
                    print(
                        "\t" + write_ImageMapLine(
                            tlx,
                            tly,
                            brx,
                            bry,
                            iopts.w,
                            iopts.h,
                            iopts.dpi,
                            chr + ":" + ",".join(bac_list),
                            segment_start,
                            segment_end,
                        ),
                        file=mapfh,
                    )

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window of this chromosome.
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = (
                xx,
                (1 - ystart) + segment_start * ratio,
                xx + xwidth,
                (1 - ystart) + segment_end * ratio,
            )
            print(
                "\t" + write_ImageMapLine(
                    tlx,
                    tly,
                    brx,
                    bry,
                    iopts.w,
                    iopts.h,
                    iopts.dpi,
                    chr + ":" + ",".join(bac_list),
                    segment_start,
                    segment_end,
                ),
                file=mapfh,
            )

    if imagemap:
        print("</map>", file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    if "centromere" in class_colors:
        del class_colors["centromere"]

    # Legend geometry is set unconditionally: the `empty` box below reads
    # xstart/xwidth/yy, which used to exist only when `legend` was true
    # (NameError or stale chromosome geometry otherwise).
    xstart = 0.1
    xwidth = 0.04
    yy = 0.08

    # class legends, four in a row
    if legend:
        # max(..., 1) avoids ZeroDivisionError when no class is left.
        xinterval = 0.8 / max(len(class_colors), 1)
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy),
                          xwidth,
                          xwidth,
                          fc=cc,
                          lw=0,
                          alpha=alpha))
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

    if empty:
        # Unfilled box placed right after the last class legend entry.
        root.add_patch(
            Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
        root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")
Esempio n. 9
0
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"

    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"

    g = nx.MultiGraph()  # use this to get connected components

    # Every handle below is opened with `with`; previously the links file
    # and the log file were never closed.
    with open(linksfile) as fp:
        for row in fp:
            c = LinkLine(row)
            # Link distances are floored at 50.
            distance = max(c.distance, 50)

            g.add_edge(c.aseqid, c.bseqid,
                    orientation=c.orientation, distance=distance)

    def get_bname(sname, prefix=False):
        # Bucket name: the part before the last "_" when grouping by
        # prefix, otherwise the single catch-all bucket "chr0".
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    scaffoldbuckets = defaultdict(list)
    seqnames = sorted(sizes.keys())

    with open(logfile, "w") as fwlog:
        for h in nx.connected_component_subgraphs(g):
            partialorder = solve_component(h, sizes, fwlog)
            # A scaffold is bucketed by the name of its first contig.
            name = partialorder[0][0]
            bname = get_bname(name, prefix=opts.prefix)
            scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in seqnames:
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    with open(agpfile, "w") as fwagp:
        for bname, ctgs in sorted(ctgbuckets.items()):
            scaffolds = scaffoldbuckets[bname]
            scaffolded = set()
            for scafID, scaf in enumerate(scaffolds):
                ctgorder = []
                for node, start, end, orientation in scaf:
                    ctgorder.append((node, orientation))
                    scaffolded.add(node)
                scaf = scafname.format(bname, scafID)
                order_to_agp(scaf, ctgorder, sizes, fwagp)
            singletons = sorted(ctgs - scaffolded)
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)

            msg = "{0}: Scaffolds={1} Singletons={2}".\
                format(bname, nscaffolds, nsingletons)
            # Explicit write works on both Python 2 and 3.
            sys.stderr.write(msg + "\n")

            for singleton in singletons:
                ctgorder = [(singleton, "+")]
                order_to_agp(singleton, ctgorder, sizes, fwagp)

    logging.debug("AGP file written to `{0}`.".format(agpfile))
Esempio n. 10
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.agp import AGP, bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    scaffoldbuckets = defaultdict(list)

    # Convert the AGP (gaps dropped) to a temporary BED file and group its
    # components per object.
    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    # Outputs are opened only after the input AGP has been read, and `with`
    # closes both (the phase file used to be left open).
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            for scaf in sorted(scaffolds):
                for node, orientation in scaf:
                    ctgorder.append((node, orientation))
                if len(scaf) == 1:
                    singletons.add(node)
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)
            # NOTE(review): every bucket holds at least one scaffold, so
            # nscaffolds is never 0 and phase 3 looks unreachable as
            # written - confirm the intended rule before changing it.
            if nsingletons == 1 and nscaffolds == 0:
                phase = 3
            elif nsingletons == 0 and nscaffolds == 1:
                phase = 2
            else:
                phase = 1

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # Explicit writes work on both Python 2 and 3.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
Esempio n. 11
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid",
                 dest="pctid",
                 default=90,
                 type="int",
                 help="Percentage identity cutoff [default: %default]")
    # Help text previously said "identity" for the coverage option.
    p.add_option("--pctcov",
                 dest="pctcov",
                 default=50,
                 type="int",
                 help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    # Work on the query supermap of the BLAST file, building it on demand.
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    # Running totals over all queries.
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive span length.  The previous form,
            # abs(qstart - qstop + 1), came out 2 short whenever
            # qstart <= qstop.
            this_covered += abs(b.qstop - b.qstart) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches +
                                this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                 this_coverage))

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Summary goes to stderr so it never mixes with --list output.
    # NOTE(review): the averages below divide by alignlen / total /
    # queries_combined and raise ZeroDivisionError on empty input.
    report = sys.stderr.write
    report("Identity: {0} mismatches, {1} gaps, {2} alignlen\n".
           format(mismatches, gaps, alignlen))
    total = len(sizes.keys())
    report("Total mapped: {0} ({1:.1f}% of {2})\n".
           format(mapped_count, mapped_count * 100. / total, total))
    report("Total valid {0}: {1} ({2:.1f}% of {3})\n".
           format(cutoff_message, valid_count, valid_count * 100. / total, total))
    report("Average id = {0:.2f}%\n".
           format(100 - (mismatches + gaps) * 100. / alignlen))

    queries_combined = sum(sizes[x] for x in queries)
    report("Coverage: {0} covered, {1} total\n".
           format(covered, queries_combined))
    report("Average coverage = {0:.2f}%\n".
           format(covered * 100. / queries_combined))

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            fw.write("{0}\n".format(query_id))
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Second pass: keep only the BLAST lines whose query passed both
    # cutoffs.  (An unused, never-closed `open(blastfile)` was removed.)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            fw.write("{0}\n".format(b))
Esempio n. 12
0
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"

    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"

    g = nx.MultiGraph()  # use this to get connected components

    # Every handle below is opened with `with`; previously the links file
    # and the log file were never closed.
    with open(linksfile) as fp:
        for row in fp:
            c = LinkLine(row)
            # Link distances are floored at 50.
            distance = max(c.distance, 50)

            g.add_edge(c.aseqid,
                       c.bseqid,
                       orientation=c.orientation,
                       distance=distance)

    def get_bname(sname, prefix=False):
        # Bucket name: the part before the last "_" when grouping by
        # prefix, otherwise the single catch-all bucket "chr0".
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    scaffoldbuckets = defaultdict(list)
    seqnames = sorted(sizes.keys())

    with open(logfile, "w") as fwlog:
        for h in nx.connected_component_subgraphs(g):
            partialorder = solve_component(h, sizes, fwlog)
            # A scaffold is bucketed by the name of its first contig.
            name = partialorder[0][0]
            bname = get_bname(name, prefix=opts.prefix)
            scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in seqnames:
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    with open(agpfile, "w") as fwagp:
        for bname, ctgs in sorted(ctgbuckets.items()):
            scaffolds = scaffoldbuckets[bname]
            scaffolded = set()
            for scafID, scaf in enumerate(scaffolds):
                ctgorder = []
                for node, start, end, orientation in scaf:
                    ctgorder.append((node, orientation))
                    scaffolded.add(node)
                scaf = scafname.format(bname, scafID)
                order_to_agp(scaf, ctgorder, sizes, fwagp)
            singletons = sorted(ctgs - scaffolded)
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)

            msg = "{0}: Scaffolds={1} Singletons={2}".\
                format(bname, nscaffolds, nsingletons)
            # Explicit write works on both Python 2 and 3.
            sys.stderr.write(msg + "\n")

            for singleton in singletons:
                ctgorder = [(singleton, "+")]
                order_to_agp(singleton, ctgorder, sizes, fwagp)

    logging.debug("AGP file written to `{0}`.".format(agpfile))