Ejemplo n.º 1
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; maintainer notes are kept in comments instead.
    #
    # For every FASTA file in `args` the contig sizes are collected and passed
    # to calculate_A50(); the returned series is sampled every `--stepsize`
    # entries and written to the tab-separated file "A50.rplot" (columns:
    # index, cumsize, fasta).  A summary table is printed to stderr.  An
    # existing "A50.rplot" is reused unless --overwrite is given, and
    # generate_plot() is called on it in either case.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        # print_help() returns None, so `not ...` gives a non-zero exit
        # status: a missing argument is a usage error, not a success.
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    if stepsize < 1:
        # A zero step makes xrange()/slicing raise only after the output file
        # was truncated; a negative one silently produces no data points.
        p.error("--stepsize must be a positive integer")

    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # `with` closes the handle even when one of the FASTA files fails.
        with open(rplot, "w") as fw:
            print >>fw, header
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for _, length in f.itersizes()])

                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                # Summary statistics use every contig size read from the file
                # (array methods avoid a Python-level scan of the array).
                cmin, cmax = ctgsizes.min(), ctgsizes.max()
                csum, counts = ctgsizes.sum(), len(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile,
                    ctgsizes))

                # Series label: file name without its last extension + L50.
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Write every `stepsize`-th value, scaled down by 1e6.
                for i, s in zip(xrange(0, len(a50), stepsize),
                                a50[::stepsize]):
                    print >> fw, "\t".join((str(i), str(s / 1000000.), tag))

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Ejemplo n.º 2
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; maintainer notes are kept in comments instead.
    #
    # For every FASTA file in `args` the contig sizes are collected and passed
    # to calculate_A50(); the returned series is sampled every `--stepsize`
    # entries and written to the tab-separated file "A50.rplot" (columns:
    # index, cumsize, fasta).  A summary table is printed to stderr.  An
    # existing "A50.rplot" is reused unless --overwrite is given, and
    # generate_plot() is called on it in either case.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        # print_help() returns None, so `not ...` gives a non-zero exit
        # status: a missing argument is a usage error, not a success.
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    if stepsize < 1:
        # A zero step makes xrange()/slicing raise only after the output file
        # was truncated; a negative one silently produces no data points.
        p.error("--stepsize must be a positive integer")

    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # `with` closes the handle even when one of the FASTA files fails.
        with open(rplot, "w") as fw:
            print >>fw, header
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for _, length in f.itersizes()])

                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                # Summary statistics use every contig size read from the file
                # (array methods avoid a Python-level scan of the array).
                cmin, cmax = ctgsizes.min(), ctgsizes.max()
                csum, counts = ctgsizes.sum(), len(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile,
                    ctgsizes))

                # Series label: file name without its last extension + L50.
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Write every `stepsize`-th value, scaled down by 1e6.
                for i, s in zip(xrange(0, len(a50), stepsize),
                                a50[::stepsize]):
                    print >> fw, "\t".join((str(i), str(s / 1000000.), tag))

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Ejemplo n.º 3
0
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; maintainer notes are kept in comments instead.
    #
    # For each input file name the matching "<stem>_fastqc" folder (optionally
    # under --dir) is located, its "fastqc_data.txt" is loaded through
    # FastQCdata, and the fields named in `header` are collected into one row.
    # The rows are shown as a table on stderr and written out via write_csv().
    from jcvi.utils.table import loadtable, write_csv

    p = OptionParser(count.__doc__)
    p.add_option("--dir",
                 help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human",
                 default=False,
                 action="store_true",
                 help="Human friendly numbers [default: %default]")
    p.set_table()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    rows = []
    human = opts.human
    for f in filenames:
        # Strip only a trailing ".gz": str.replace() would also delete ".gz"
        # found in a directory name or in the middle of the file name and
        # point at a folder that does not exist.
        stem = f[:-len(".gz")] if f.endswith(".gz") else f
        folder = stem.rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        # The header names double as the lookup keys into the FastQC data.
        row = [fqcdata[x] for x in header]
        rows.append(row)

    print >> sys.stderr, loadtable(header, rows)
    write_csv(header,
              rows,
              sep=opts.sep,
              filename=opts.outfile,
              align=opts.align)
Ejemplo n.º 4
0
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; maintainer notes are kept in comments instead.
    #
    # For each input file name the matching "<stem>_fastqc" folder (optionally
    # under --dir) is located, its "fastqc_data.txt" is loaded through
    # FastQCdata, and the fields named in `header` are collected into one row.
    # The rows are shown as a table on stderr and written as CSV to the
    # handle that must_open() returns for opts.outfile.
    import csv

    from jcvi.utils.table import loadtable

    p = OptionParser(count.__doc__)
    p.add_option("--dir",
                help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                help="Human friendly numbers [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    rows = []
    human = opts.human
    for f in filenames:
        # Strip only a trailing ".gz": str.replace() would also delete ".gz"
        # found in a directory name or in the middle of the file name and
        # point at a folder that does not exist.
        stem = f[:-len(".gz")] if f.endswith(".gz") else f
        folder = stem.rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        # The header names double as the lookup keys into the FastQC data.
        row = [fqcdata[x] for x in header]
        rows.append(row)

    print >> sys.stderr, loadtable(header, rows)

    fw = must_open(opts.outfile, "w")
    try:
        # csv.writer quotes fields containing commas or quotes, which a bare
        # ",".join() would turn into a malformed row; "\n" keeps the line
        # ending that the print statement used to emit.
        writer = csv.writer(fw, lineterminator="\n")
        writer.writerow(header)
        for row in rows:
            writer.writerow([str(x) for x in row])
    finally:
        # NOTE(review): must_open() is defined elsewhere; presumably it can
        # hand back a standard stream for some outfile values - confirm.
        # Only real files are closed so the process streams stay usable.
        if fw is not sys.stdout and fw is not sys.stderr:
            fw.close()