Exemple #1
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; developer notes belong in comments like this.
    #
    # For every FASTA file given in `args`, the contig sizes are read, passed
    # to calculate_A50(), and a sampled copy of the returned a50 series is
    # written to "A50.rplot"; a summary table goes to stderr. The data file is
    # only (re)generated when it is missing or --overwrite is given, and
    # generate_plot() is always called on it at the end.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # The context manager guarantees the data file is closed even when
        # one of the FASTA files fails part-way through the loop.
        with open(rplot, "w") as fw:
            # write() instead of the `print >>` statement so the same code
            # runs under both Python 2 and Python 3.
            fw.write(header + "\n")
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax = min(ctgsizes), max(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile,
                    ctgsizes))

                # Series label: file name without its last extension, plus
                # the l50 value reported by calculate_A50().
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Only every `stepsize`-th point is written; the value is
                # divided by one million before being written.
                sampled = zip(range(0, len(a50), stepsize), a50[::stepsize])
                for i, s in sampled:
                    fields = (str(i), str(s / 1000000.), tag)
                    fw.write("\t".join(fields) + "\n")

        table = loadtable(statsheader, statsrows)
        sys.stderr.write(str(table) + "\n")

    generate_plot(rplot)
Exemple #2
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; developer notes belong in comments like this.
    #
    # For every FASTA file given in `args`, the contig sizes are read, passed
    # to calculate_A50(), and a sampled copy of the returned a50 series is
    # written to "A50.rplot"; a summary table goes to stderr. The data file is
    # only (re)generated when it is missing or --overwrite is given, and
    # generate_plot() is always called on it at the end.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # The context manager guarantees the data file is closed even when
        # one of the FASTA files fails part-way through the loop.
        with open(rplot, "w") as fw:
            # write() instead of the `print >>` statement so the same code
            # runs under both Python 2 and Python 3.
            fw.write(header + "\n")
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax = min(ctgsizes), max(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile,
                    ctgsizes))

                # Series label: file name without its last extension, plus
                # the l50 value reported by calculate_A50().
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Only every `stepsize`-th point is written; the value is
                # divided by one million before being written.
                sampled = zip(range(0, len(a50), stepsize), a50[::stepsize])
                for i, s in sampled:
                    fields = (str(i), str(s / 1000000.), tag)
                    fw.write("\t".join(fields) + "\n")

        table = loadtable(statsheader, statsrows)
        sys.stderr.write(str(table) + "\n")

    generate_plot(rplot)
Exemple #3
0
    def rstats(self, object, bacs, components, scaffold_sizes, length):
        """
        Assemble one row of summary statistics.

        Returns the tuple ``(object, number of bacs, components, number of
        scaffolds, n50, l50, length)``, where ``l50`` and ``n50`` are the
        second and third values returned by ``calculate_A50(scaffold_sizes)``
        and both ``l50`` and ``length`` have been passed through
        ``human_size``.
        """
        from jcvi.utils.cbook import human_size

        bac_count, scaffold_count = len(bacs), len(scaffold_sizes)
        # The first value returned by calculate_A50 is not needed here.
        _, l50, n50 = calculate_A50(scaffold_sizes)
        readable_l50 = human_size(l50)
        readable_length = human_size(length)

        return (object, bac_count, components, scaffold_count, n50,
                readable_l50, readable_length)
Exemple #4
0
    def rstats(self, object, bacs, components, scaffold_sizes, length):
        """
        Assemble one row of summary statistics.

        Returns the tuple ``(object, number of bacs, components, number of
        scaffolds, n50, l50, length)``, where ``l50`` and ``n50`` are the
        second and third values returned by ``calculate_A50(scaffold_sizes)``
        and both ``l50`` and ``length`` have been passed through
        ``human_size``.
        """
        from jcvi.utils.cbook import human_size

        bac_count, scaffold_count = len(bacs), len(scaffold_sizes)
        # The first value returned by calculate_A50 is not needed here.
        _, l50, n50 = calculate_A50(scaffold_sizes)
        readable_l50 = human_size(l50)
        readable_length = human_size(length)

        return (object, bac_count, components, scaffold_count, n50,
                readable_l50, readable_length)
Exemple #5
0
def get_stats(blastfile, strict=False):
    """
    Collect alignment statistics from `blastfile` into an AlignStats record.

    Each line of the file is parsed with BlastLine. Query and subject
    intervals are stored with start <= stop, and the per-hit gap counts,
    identical-base counts (hitlen - nmismatch - ngaps) and hit lengths are
    accumulated.

    Args:
        blastfile: path of the file to read.
        strict: when true, half of the total gap count is subtracted from
            both the query and the reference covered totals.

    Returns:
        AlignStats built from the file's base name, the range_union() results
        for query and reference, the range_span() results for query and
        reference, the identical-base total, and the second value returned by
        calculate_A50() over the hit lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`" % blastfile)
    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []

    # Context manager so the file handle is released even if a row fails
    # to parse.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            # Store intervals with start <= stop regardless of orientation.
            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            ngaps += c.ngaps
            identicals += c.hitlen - c.nmismatch - c.ngaps
            alignlens.append(c.hitlen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases, since we
        # did not track individually gaps in qry and ref, we assume
        # the gaps are opened evenly in the two sequences
        # NOTE(review): `/` is true division on Python 3, so an odd gap total
        # yields a fractional covered count there -- confirm this is intended.
        qrycovered -= ngaps / 2
        refcovered -= ngaps / 2
    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(
        filename, qrycovered, refcovered, qryspan, refspan, identicals, AL50
    )

    return alignstats
Exemple #6
0
def get_stats(blastfile, strict=False):
    """
    Collect alignment statistics from `blastfile` into an AlignStats record.

    Each line of the file is parsed with BlastLine. Query and subject
    intervals are stored with start <= stop, and the per-hit gap counts,
    identical-base counts (hitlen - nmismatch - ngaps) and hit lengths are
    accumulated.

    Args:
        blastfile: path of the file to read.
        strict: when true, half of the total gap count is subtracted from
            both the query and the reference covered totals.

    Returns:
        AlignStats built from the file's base name, the range_union() results
        for query and reference, the range_span() results for query and
        reference, the identical-base total, and the second value returned by
        calculate_A50() over the hit lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`" % blastfile)
    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []

    # Context manager so the file handle is released even if a row fails
    # to parse.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            # Store intervals with start <= stop regardless of orientation.
            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            ngaps += c.ngaps
            identicals += c.hitlen - c.nmismatch - c.ngaps
            alignlens.append(c.hitlen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases, since we
        # did not track individually gaps in qry and ref, we assume
        # the gaps are opened evenly in the two sequences
        # NOTE(review): `/` is true division on Python 3, so an odd gap total
        # yields a fractional covered count there -- confirm this is intended.
        qrycovered -= ngaps / 2
        refcovered -= ngaps / 2
    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(filename, qrycovered, refcovered,
                            qryspan, refspan, identicals, AL50)

    return alignstats
Exemple #7
0
def get_stats(coordsfile):
    """
    Collect alignment statistics from `coordsfile` into an AlignStats record.

    The file is loaded through Coords. For every record the query interval
    (start2/end2) and reference interval (start1/end1) are stored with
    start <= stop; the alignment length is taken as the reference stop minus
    start, and identical bases are estimated as identity / 100 * length.

    Args:
        coordsfile: path of the file to read.

    Returns:
        AlignStats built from the file's base name, the range_union() results
        for query and reference, None for both span fields, and the
        identical-base total.
    """
    from jcvi.utils.range import range_union

    logging.debug("Report stats on `%s`" % coordsfile)
    coords = Coords(coordsfile)
    ref_ivs = []
    qry_ivs = []
    identicals = 0
    alignlens = []

    for c in coords:
        # Store intervals with start <= stop regardless of orientation.
        qstart, qstop = c.start2, c.end2
        if qstart > qstop:
            qstart, qstop = qstop, qstart
        qry_ivs.append((c.query, qstart, qstop))

        sstart, sstop = c.start1, c.end1
        if sstart > sstop:
            sstart, sstop = sstop, sstart
        ref_ivs.append((c.ref, sstart, sstop))

        alen = sstop - sstart
        identicals += c.identity / 100. * alen
        alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    # NOTE(review): AL50 is computed but never forwarded to AlignStats --
    # confirm whether AlignStats should receive it as an extra argument.
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(coordsfile)
    alignstats = AlignStats(filename, qrycovered, refcovered,
                            None, None, identicals)

    return alignstats
Exemple #8
0
def get_stats(coordsfile):
    """
    Collect alignment statistics from `coordsfile` into an AlignStats record.

    The file is loaded through Coords. For every record the query interval
    (start2/end2) and reference interval (start1/end1) are stored with
    start <= stop; the alignment length is taken as the reference stop minus
    start, and identical bases are estimated as identity / 100 * length.

    Args:
        coordsfile: path of the file to read.

    Returns:
        AlignStats built from the file's base name, the range_union() results
        for query and reference, None for both span fields, and the
        identical-base total.
    """
    from jcvi.utils.range import range_union

    logging.debug("Report stats on `%s`" % coordsfile)
    coords = Coords(coordsfile)
    ref_ivs = []
    qry_ivs = []
    identicals = 0
    alignlens = []

    for c in coords:
        # Store intervals with start <= stop regardless of orientation.
        qstart, qstop = c.start2, c.end2
        if qstart > qstop:
            qstart, qstop = qstop, qstart
        qry_ivs.append((c.query, qstart, qstop))

        sstart, sstop = c.start1, c.end1
        if sstart > sstop:
            sstart, sstop = sstop, sstart
        ref_ivs.append((c.ref, sstart, sstop))

        alen = sstop - sstart
        identicals += c.identity / 100. * alen
        alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    # NOTE(review): AL50 is computed but never forwarded to AlignStats --
    # confirm whether AlignStats should receive it as an extra argument.
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(coordsfile)
    alignstats = AlignStats(filename, qrycovered, refcovered, None, None,
                            identicals)

    return alignstats
Exemple #9
0
    def summary(self):
        """
        Return ``(total, l50, n50)`` for ``self.sizes``.

        ``total`` is the sum of the sizes; ``l50`` and ``n50`` are the second
        and third values returned by ``calculate_A50``.
        """
        from jcvi.assembly.base import calculate_A50

        sizes = self.sizes
        # The first value returned by calculate_A50 is not needed here.
        _, l50, n50 = calculate_A50(sizes)
        total = sum(sizes)
        return total, l50, n50
Exemple #10
0
    def summary(self):
        """
        Return ``(total, l50, n50)`` for ``self.sizes``.

        ``total`` is the sum of the sizes; ``l50`` and ``n50`` are the second
        and third values returned by ``calculate_A50``.
        """
        from jcvi.assembly.base import calculate_A50

        sizes = self.sizes
        # The first value returned by calculate_A50 is not needed here.
        _, l50, n50 = calculate_A50(sizes)
        total = sum(sizes)
        return total, l50, n50