def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added to it.
    #
    # Workflow:
    #   1. Parse options; print help and exit when no FASTA files are given.
    #   2. Unless "A50.rplot" already exists (and --overwrite is not set),
    #      write one tab-separated row per sampled point of the A50 series of
    #      every FASTA file, and report a per-file stats table on stderr.
    #   3. Hand the .rplot file to generate_plot().
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
                 help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
                 help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
                 help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average",
                       "Sum", "Counts")
        statsrows = []

        # The context manager guarantees the file is closed even when one of
        # the inputs fails to parse half-way through.
        with open(rplot, "w") as fw:
            fw.write(header + "\n")
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                # Only the A50 computation honours --cutoff; the summary
                # statistics below are taken over all contig sizes.
                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax, cmean = \
                    min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                cmean = int(round(cmean))
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                                  csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

                # Series label: file name without its last extension.
                tag = "{0} (L50={1})".format(
                    op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Sample every `stepsize`-th point; values are divided by
                # 1,000,000 before being written to the "cumsize" column.
                for i in range(0, len(a50), stepsize):
                    row = (str(i), str(a50[i] / 1000000.), tag)
                    fw.write("\t".join(row) + "\n")

        table = loadtable(statsheader, statsrows)
        sys.stderr.write(str(table) + "\n")

    generate_plot(rplot)
def rstats(self, object, bacs, components, scaffold_sizes, length):
    """Build one summary row for `object`.

    Returns a 7-tuple:
    (object, number of bacs, components, number of scaffolds, n50,
    human-readable l50, human-readable length), where l50 and n50 are the
    second and third values returned by calculate_A50(scaffold_sizes).
    """
    from jcvi.utils.cbook import human_size

    bac_count = len(bacs)
    scaffold_count = len(scaffold_sizes)

    # The first element of the calculate_A50 result is not needed here.
    _, l50, n50 = calculate_A50(scaffold_sizes)

    return (
        object,
        bac_count,
        components,
        scaffold_count,
        n50,
        human_size(l50),
        human_size(length),
    )
def get_stats(blastfile, strict=False):
    """Summarize the alignments in `blastfile` as an AlignStats object.

    Each line of the file is parsed with BlastLine. Query and subject
    intervals are collected (start/stop normalized so start <= stop) and
    reduced with range_union / range_span; identical positions are counted as
    hitlen - nmismatch - ngaps per hit.

    Args:
        blastfile: path to the BLAST output file to read.
        strict: when True, subtract half of the total gap count from both the
            query and the reference covered-base totals.

    Returns:
        AlignStats(filename, qrycovered, refcovered, qryspan, refspan,
        identicals, AL50), where AL50 is the second value returned by
        calculate_A50 over the per-hit alignment lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`" % blastfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []

    # Use a context manager so the file handle is released deterministically.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            alen = c.hitlen
            ngaps += c.ngaps
            identicals += c.hitlen - c.nmismatch - c.ngaps
            alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases, since we
        # did not track individually gaps in qry and ref, we assume
        # the gaps are opened evenly in the two sequences.
        # Floor division keeps the base counts integral on Python 3 as well
        # (plain `/` would yield fractional bases for an odd gap count).
        qrycovered -= ngaps // 2
        refcovered -= ngaps // 2

    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(
        filename, qrycovered, refcovered, qryspan, refspan, identicals, AL50
    )

    return alignstats
def get_stats(blastfile, strict=False):
    """Compute alignment statistics for a BLAST file.

    Every line is parsed with BlastLine; the query and subject coordinates of
    each hit are stored as (name, start, stop) with start <= stop. Covered
    bases come from range_union, spans from range_span, and identical
    positions are accumulated as hitlen - nmismatch - ngaps.

    Args:
        blastfile: path of the BLAST file to summarize.
        strict: if True, half of the summed gap count is deducted from both
            the query and the reference covered-base totals.

    Returns:
        An AlignStats built from (basename of blastfile, qrycovered,
        refcovered, qryspan, refspan, identicals, AL50); AL50 is the second
        value returned by calculate_A50 on the list of hit lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`" % blastfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []

    # `with` closes the input even if a malformed line makes BlastLine raise.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            alen = c.hitlen
            ngaps += c.ngaps
            identicals += c.hitlen - c.nmismatch - c.ngaps
            alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases, since we
        # did not track individually gaps in qry and ref, we assume
        # the gaps are opened evenly in the two sequences.
        # `//` keeps the result an integer base count on Python 2 and 3 alike.
        qrycovered -= ngaps // 2
        refcovered -= ngaps // 2

    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(filename, qrycovered, refcovered,
                            qryspan, refspan, identicals, AL50)

    return alignstats
def get_stats(coordsfile):
    """Summarize the alignments in `coordsfile` as an AlignStats object.

    Records are read through Coords(coordsfile). For each record the query
    interval (start2/end2) and reference interval (start1/end1) are stored
    with start <= stop; the alignment length is taken as the reference
    interval width (sstop - sstart) and identical positions are estimated as
    identity / 100 * length.

    Args:
        coordsfile: path to the coords file to read.

    Returns:
        AlignStats(filename, qrycovered, refcovered, None, None, identicals,
        AL50). Spans are not computed here, hence the two None values. AL50
        is the second value returned by calculate_A50 over the per-record
        alignment lengths.
    """
    from jcvi.utils.range import range_union

    logging.debug("Report stats on `%s`" % coordsfile)
    coords = Coords(coordsfile)
    ref_ivs = []
    qry_ivs = []
    identicals = 0
    alignlens = []

    for c in coords:
        qstart, qstop = c.start2, c.end2
        if qstart > qstop:
            qstart, qstop = qstop, qstart
        qry_ivs.append((c.query, qstart, qstop))

        sstart, sstop = c.start1, c.end1
        if sstart > sstop:
            sstart, sstop = sstop, sstart
        ref_ivs.append((c.ref, sstart, sstop))

        alen = sstop - sstart
        identicals += c.identity / 100. * alen
        alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(coordsfile)

    # Forward AL50 instead of discarding it, mirroring how the BLAST-based
    # get_stats builds its AlignStats.
    alignstats = AlignStats(filename, qrycovered, refcovered,
                            None, None, identicals, AL50)

    return alignstats
def summary(self):
    """Return (total size, l50, n50) for the sizes held in `self.sizes`.

    l50 and n50 are the second and third values returned by
    calculate_A50(self.sizes); the total is the plain sum of the sizes.
    """
    from jcvi.assembly.base import calculate_A50

    sizes = self.sizes
    # Only the last two values of the calculate_A50 result are reported.
    _, l50, n50 = calculate_A50(sizes)
    total = sum(sizes)
    return total, l50, n50