Example #1
0
def report(args):
    """
    %prog report [--options] ace_file > report

    Prepare a report of read location, consensus location or quality segment per contig
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(report.__doc__)

    types = {"read":      ["padded_start", "padded_end", "orient"],
             "consensus": ["padded_consensus_start", "padded_consensus_end"],
             "quality"  : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"]
            }
    valid_types = tuple(types.keys())
    p.add_option("--type", default="read", choices=valid_types,
            help="choose report type [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    for c in ace.contigs:
        print c.name
        table = dict()
        if opts.type == "read":
            ps, pe = [], []
            ps = [read.padded_start for read in c.af]
            for i in xrange(1, len(ps)):
                pe.append(ps[i] - ps[i-1])
            pe.append(c.nbases)
            map = dict(zip(ps, pe))
            for i, read in enumerate(c.af):
                values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "consensus":
            for read in c.bs:
                values = [str(x) for x in (read.padded_start, read.padded_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "quality":
            for read in c.reads:
                (r1, r2) = (read.rd, read.qa)
                values = [str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(r1.name), label)] = values[i]
        print tabulate(table), "\n"
Example #2
0
File: ace.py Project: rrane/jcvi
def report(args):
    """
    %prog report [--options] ace_file > report

    Prepare a report of read location, consensus location or quality segment per contig
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(report.__doc__)

    types = {"read":      ["padded_start", "padded_end", "orient"],
             "consensus": ["padded_consensus_start", "padded_consensus_end"],
             "quality"  : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"]
            }
    valid_types = tuple(types.keys())
    p.add_option("--type", default="read", choices=valid_types,
            help="choose report type [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    for c in ace.contigs:
        print c.name
        table = dict()
        if opts.type == "read":
            ps, pe = [], []
            ps = [read.padded_start for read in c.af]
            for i in xrange(1, len(ps)):
                pe.append(ps[i] - ps[i-1])
            pe.append(c.nbases)
            map = dict(zip(ps, pe))
            for i, read in enumerate(c.af):
                values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "consensus":
            for read in c.bs:
                values = [str(x) for x in (read.padded_start, read.padded_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "quality":
            for read in c.reads:
                (r1, r2) = (read.rd, read.qa)
                values = [str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(r1.name), label)] = values[i]
        print tabulate(table), "\n"
Example #3
0
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    # The map BED and the chromosome AGP paths are both built from the input
    # file name with its last extension removed.
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    # Only the first two fields of the three-field size summary are used.
    total, l50, _ = s.summary

    fw = must_open(opts.outfile, "w")

    # Part 1: one table column per individual map.
    print >> fw, "*** Summary for each individual map ***"
    r = {}
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
    print >> fw, tabulate(r, sep=sep, align=align)

    # Part 2: scaffold categories taken from the non-gap lines of the AGP.
    r = {}
    agp = AGP(chr_agp)
    print >> fw, "*** Summary for consensus map ***"
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    oriented_scaffolds = set(x.component_id for x in agp
                             if (not x.is_gap) and x.orientation != '?')
    # Anything in the size table that never appears in the AGP is unplaced.
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    categories = (("Anchored", consensus_scaffolds),
                  ("Oriented", oriented_scaffolds),
                  ("Unplaced", unplaced_scaffolds))
    for mapname, sc in categories:
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print >> fw, tabulate(r, sep=sep, align=align)
Example #4
0
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    # The map BED and the chromosome AGP paths are both built from the input
    # file name with its last extension removed.
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    # Only the first two fields of the three-field size summary are used.
    total, l50, _ = s.summary

    fw = must_open(opts.outfile, "w")

    # Part 1: one table column per individual map.
    print >> fw, "*** Summary for each individual map ***"
    r = {}
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
    print >> fw, tabulate(r, sep=sep, align=align)

    # Part 2: scaffold categories taken from the non-gap lines of the AGP.
    r = {}
    agp = AGP(chr_agp)
    print >> fw, "*** Summary for consensus map ***"
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    # Anything in the size table that never appears in the AGP is unplaced.
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    categories = (("Anchored", consensus_scaffolds),
                  ("Unplaced", unplaced_scaffolds))
    for mapname, sc in categories:
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print >> fw, tabulate(r, sep=sep, align=align)
Example #5
0
def allstats(args):
    """
    %prog allstats fastafiles

    Summarize multiple FASTA in a table.
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(allstats.__doc__)
    p.add_option("--exclude",
                 help=("Exclude statistics, must be {0}, "
                       "multiple separated by comma [default: %default]").
                      format("|".join(header)))

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    # --exclude has no default, so it is None unless given on the command
    # line; treat that as "exclude nothing" rather than calling None.split().
    exclude = opts.exclude.split(",") if opts.exclude else []
    unknown = [x for x in exclude if x not in header]
    assert not unknown, \
        "Unknown statistics in --exclude: {0}".format(",".join(unknown))

    # Table keys are (file prefix, statistic name).
    tabledict = {}
    for fastafile in fastafiles:
        pf = fastafile.rsplit(".", 1)[0]
        for key, val in n50([fastafile]):
            if key in exclude:
                continue
            tabledict[(pf, key)] = val

    table = tabulate(tabledict)
    print >> sys.stderr, table
Example #6
0
def test_tabulate():
    """Check tabulate() output for a 2x2 table, plain and transposed."""
    from jcvi.utils.table import tabulate

    # Keys are (row, column) pairs.
    data = {(1, "a"): 3, (1, "b"): 4, (2, "a"): 5, (2, "b"): 0}

    expected_plain = """===========
o    a    b
-----------
1    3    4
2    5    0
-----------"""
    expected_transposed = """===========
o    1    2
-----------
a    3    5
b    4    0
-----------"""

    plain = tabulate(data)
    assert plain == expected_plain
    transposed = tabulate(data, transpose=True)
    assert transposed == expected_transposed
Example #7
0
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # One table per metric; each metric is also the name of the directory
    # that holds one `<prefix>.txt` file of integers per GFF file.
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for gff_file in gff_files:
            pf = op.basename(gff_file).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            # Close the numbers file promptly instead of leaking the handle.
            with open(numberfile) as fp:
                ar = [int(line.strip()) for line in fp]
            # Table keys are (file prefix, statistic name).
            for key, val in SummaryStats(ar).todict().items():
                table[(pf, key)] = val

        print >> sys.stderr, tabulate(table)
Example #8
0
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # One table per metric; each metric is also the name of the directory
    # that holds one `<prefix>.txt` file of integers per GFF file.
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for gff_file in gff_files:
            pf = op.basename(gff_file).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            # Close the numbers file promptly instead of leaking the handle.
            with open(numberfile) as fp:
                ar = [int(line.strip()) for line in fp]
            # Table keys are (file prefix, statistic name).
            for key, val in SummaryStats(ar).todict().items():
                table[(pf, key)] = val

        print >> sys.stderr, tabulate(table)
Example #9
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull one interval's sequence out of the reference FASTA.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per feature class so that both sizes and GC
    # content can be computed from them afterwards.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        gene_id = gene.id
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # A set collapses exons repeated across the gene's children.
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(gene_id, 2)
                         if c.featuretype == "exon"))
        for chrom, start, stop in exons:
            exonseqs.append(fetch(chrom, start, stop))

        # Introns are whatever range_interleave() returns for the exon list.
        for chrom, start, stop in range_interleave(exons):
            intronseqs.append(fetch(chrom, start, stop))

    r = {}  # Report, keyed by (feature class, statistic)
    classes = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for t, tseqs in classes:
        tsummary = SummaryStats([len(seq) for seq in tseqs], dtype="int")
        total_bp = tsummary.sum
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(total_bp, precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(total_bp, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print >> sys.stderr, tabulate(r)
Example #10
0
File: ca.py Project: rrane/jcvi
def script(args):
    """
    %prog script bfs_rfs libs

    `bfs_rfs` contains the joined results from Brian's `classifyMates`. We want
    to keep the RFS result (but not in the BFS result) to retain actual MP. Libs
    contain a list of lib iids, use comma to separate, e.g. "9,10,11".
    """
    p = OptionParser(script.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(p.print_help())

    fsfile, libs = args
    libs = [int(x) for x in libs.split(",")]
    fp = open(fsfile)
    not_found = ("limited", "exhausted")
    counts = defaultdict(int)
    pe, mp = 0, 0
    both, noidea = 0, 0
    total = 0

    for i in libs:
        print "lib iid {0} allfragsunmated 1".format(i)

    for row in fp:
        frgiid, bfs, rfs = row.split()
        bfs = (bfs not in not_found)
        rfs = (rfs not in not_found)
        if bfs and (not rfs):
            pe += 1
        if rfs and (not bfs):
            mp += 1
            frgiid = int(frgiid)
            mateiid = frgiid + 1
            print "frg iid {0} mateiid {1}".format(frgiid, mateiid)
            print "frg iid {0} mateiid {1}".format(mateiid, frgiid)
        if bfs and rfs:
            both += 1
        if (not bfs) and (not rfs):
            noidea += 1
        total += 1

    assert pe + mp + both + noidea == total
    counts[("PE", "N")] = pe
    counts[("MP", "N")] = mp
    counts[("Both", "N")] = both
    counts[("No Idea", "N")] = noidea

    table = tabulate(counts)
    func = lambda a: a * 100. / total
    table = table.withNewColumn("Percentage", callback=func,
            columns=("N",), digits=2)
    print >> sys.stderr, table
Example #11
0
def script(args):
    """
    %prog script bfs_rfs libs

    `bfs_rfs` contains the joined results from Brian's `classifyMates`. We want
    to keep the RFS result (but not in the BFS result) to retain actual MP. Libs
    contain a list of lib iids, use comma to separate, e.g. "9,10,11".
    """
    p = OptionParser(script.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(p.print_help())

    fsfile, libs = args
    libs = [int(x) for x in libs.split(",")]
    fp = open(fsfile)
    not_found = ("limited", "exhausted")
    counts = defaultdict(int)
    pe, mp = 0, 0
    both, noidea = 0, 0
    total = 0

    for i in libs:
        print "lib iid {0} allfragsunmated 1".format(i)

    for row in fp:
        frgiid, bfs, rfs = row.split()
        bfs = (bfs not in not_found)
        rfs = (rfs not in not_found)
        if bfs and (not rfs):
            pe += 1
        if rfs and (not bfs):
            mp += 1
            frgiid = int(frgiid)
            mateiid = frgiid + 1
            print "frg iid {0} mateiid {1}".format(frgiid, mateiid)
            print "frg iid {0} mateiid {1}".format(mateiid, frgiid)
        if bfs and rfs:
            both += 1
        if (not bfs) and (not rfs):
            noidea += 1
        total += 1

    assert pe + mp + both + noidea == total
    counts[("PE", "N")] = pe
    counts[("MP", "N")] = mp
    counts[("Both", "N")] = both
    counts[("No Idea", "N")] = noidea

    table = tabulate(counts)
    func = lambda a: a * 100. / total
    table = table.withNewColumn("Percentage", callback=func,
            columns=("N",), digits=2)
    print >> sys.stderr, table
Example #12
0
def stats(args):
    """
    %prog stats agpfile

    Print out a report for length of gaps and components.
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(stats.__doc__)
    p.add_option("--warn",
                 default=False,
                 action="store_true",
                 help="Warnings on small component spans [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; negating it makes a usage error exit
        # with status 1 instead of 0.
        sys.exit(not p.print_help())

    agpfile, = args

    agp = AGP(agpfile)
    # Both lists hold (span, label) tuples so that min()/max() compare on the
    # span first and carry the label along for the report.
    gap_lengths = []
    component_lengths = []
    for a in agp:
        span = a.object_span
        if a.is_gap:
            gap_lengths.append((span, a.gap_type))
            continue

        label = "{0}:{1}-{2}".format(a.component_id, a.component_beg,
                                     a.component_end)
        component_lengths.append((span, label))
        if opts.warn and span < 50:
            logging.error("component span too small ({0}):\n{1}".
                          format(span, a))

    # Table keys are (category, statistic).
    table = dict()
    for label, lengths in (("Gaps", gap_lengths),
                           ("Components", component_lengths)):
        if not lengths:
            # Nothing of this kind in the AGP: fill with placeholders.
            for stat in ("Min", "Max", "Sum"):
                table[(label, stat)] = "n.a."
            continue

        table[(label, "Min")] = "{0} ({1})".format(*min(lengths))
        table[(label, "Max")] = "{0} ({1})".format(*max(lengths))
        table[(label, "Sum")] = sum(x[0] for x in lengths)

    print >> sys.stderr, tabulate(table)
Example #13
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull one interval's sequence out of the reference FASTA.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per feature class so that both sizes and GC
    # content can be computed from them afterwards.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        gene_id = gene.id
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # A set collapses exons repeated across the gene's children.
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(gene_id, 2)
                         if c.featuretype == "exon"))
        for chrom, start, stop in exons:
            exonseqs.append(fetch(chrom, start, stop))

        # Introns are whatever range_interleave() returns for the exon list.
        for chrom, start, stop in range_interleave(exons):
            intronseqs.append(fetch(chrom, start, stop))

    r = {}  # Report, keyed by (feature class, statistic)
    classes = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for t, tseqs in classes:
        tsummary = SummaryStats([len(seq) for seq in tseqs], dtype="int")
        total_bp = tsummary.sum
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(total_bp, precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(total_bp, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
Example #14
0
File: bed.py Project: bennyyu/jcvi
    def __str__(self):
        # Render the four confusion counts as a table, followed by one line
        # each for sensitivity, specificity and accuracy (as percentages).
        from jcvi.utils.table import tabulate

        confusion = {
            ("Prediction-True", "Reality-True"): self.TP,
            ("Prediction-True", "Reality-False"): self.FP,
            ("Prediction-False", "Reality-True"): self.FN,
            ("Prediction-False", "Reality-False"): self.TN,
        }
        pieces = [
            str(tabulate(confusion)),
            "Sensitivity [TP / (TP + FN)]: {0:.1f} %".format(
                self.sensitivity * 100),
            "Specificity [TP / (TP + FP)]: {0:.1f} %".format(
                self.specificity * 100),
            "Accuracy [(TP + TN) / (TP + FP + FN + TN)]: {0:.1f} %".format(
                self.accuracy * 100),
        ]
        # Newline-separated, no trailing newline.
        return "\n".join(pieces)
Example #15
0
File: agp.py Project: bennyyu/jcvi
def stats(args):
    """
    %prog stats agpfile

    Print out a report for length of gaps and components.
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(stats.__doc__)
    p.add_option("--warn", default=False, action="store_true",
                 help="Warnings on small component spans [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; negating it makes a usage error exit
        # with status 1 instead of 0.
        sys.exit(not p.print_help())

    agpfile, = args

    agp = AGP(agpfile)
    # Both lists hold (span, label) tuples so that min()/max() compare on the
    # span first and carry the label along for the report.
    gap_lengths = []
    component_lengths = []
    for a in agp:
        span = a.object_span
        if a.is_gap:
            gap_lengths.append((span, a.gap_type))
            continue

        label = "{0}:{1}-{2}".format(a.component_id, a.component_beg,
                                     a.component_end)
        component_lengths.append((span, label))
        if opts.warn and span < 50:
            logging.error("component span too small ({0}):\n{1}".
                          format(span, a))

    # Table keys are (category, statistic).
    table = dict()
    for label, lengths in (("Gaps", gap_lengths),
                           ("Components", component_lengths)):
        if not lengths:
            # Nothing of this kind in the AGP: fill with placeholders.
            for stat in ("Min", "Max", "Sum"):
                table[(label, stat)] = "n.a."
            continue

        table[(label, "Min")] = "{0} ({1})".format(*min(lengths))
        table[(label, "Max")] = "{0} ({1})".format(*max(lengths))
        table[(label, "Sum")] = sum(x[0] for x in lengths)

    print >> sys.stderr, tabulate(table)
Example #16
0
File: bed.py Project: radaniba/jcvi
    def __str__(self):
        # Render the four confusion counts as a table, followed by one line
        # each for sensitivity, specificity and accuracy (as percentages).
        from jcvi.utils.table import tabulate

        confusion = {
            ("Prediction-True", "Reality-True"): self.TP,
            ("Prediction-True", "Reality-False"): self.FP,
            ("Prediction-False", "Reality-True"): self.FN,
            ("Prediction-False", "Reality-False"): self.TN,
        }
        pieces = [
            str(tabulate(confusion)),
            "Sensitivity [TP / (TP + FN)]: {0:.1f} %".format(
                self.sensitivity * 100),
            "Specificity [TP / (TP + FP)]: {0:.1f} %".format(
                self.specificity * 100),
            "Accuracy [(TP + TN) / (TP + FP + FN + TN)]: {0:.1f} %".format(
                self.accuracy * 100),
        ]
        # Newline-separated, no trailing newline.
        return "\n".join(pieces)
Example #17
0
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Per-transcript cache file with columns: id, summed exon length, group
    # label. It is only rewritten when need_update() says so.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                # Transcripts lacking the grouping attribute go under "all".
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1)
                            for c in g.children(fid, 1)
                            if c.featuretype == "exon")
                print >> fw, "\t".join((fid, str(tsize), conf_class))

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                       if c.featuretype == "mRNA"]
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        # A gene is assigned to the group of its first listed transcript.
        conf_class = conf_classes[transcripts[0]]
        genes.append(GeneStats(feat, conf_class, transcript_sizes, exons))

    def mean(total, count):
        # Float mean; an empty denominator gives 0 instead of raising
        # ZeroDivisionError.
        return total * 1. / count if count else 0.

    r = {}  # Report, keyed by (statistic, group)
    for group in set(conf_classes.values()):
        members = [gs for gs in genes if gs.conf_class == group]
        num_genes = len(members)
        if not num_genes:
            # A label carried only by non-first transcripts has no genes;
            # there is nothing to average, so it is left out of the report.
            logging.debug("No genes in group `{0}`, skipped.".format(group))
            continue

        num_single_exon_genes = sum(1 for gs in members if gs.num_exons == 1)
        num_multi_exon_genes = num_genes - num_single_exon_genes
        num_genes_with_alts = sum(1 for gs in members
                                  if gs.num_transcripts > 1)
        num_transcripts = sum(gs.num_transcripts for gs in members)
        num_exons = sum(gs.num_exons for gs in members)
        cum_locus_size = sum(gs.locus_size for gs in members)
        cum_transcript_size = sum(gs.cum_transcript_size for gs in members)
        cum_exon_size = sum(gs.cum_exon_size for gs in members)

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = \
            mean(num_exons, num_genes)
        r[("Mean number of transcripts per gene", group)] = \
            mean(num_transcripts, num_genes)
        r[("Mean gene locus size (first to last exon)", group)] = \
            mean(cum_locus_size, num_genes)
        r[("Mean transcript size (UTR, CDS)", group)] = \
            mean(cum_transcript_size, num_transcripts)
        r[("Mean exon size", group)] = mean(cum_exon_size, num_exons)

    print >> sys.stderr, tabulate(r)
Example #18
0
File: vcf.py Project: Hensonmw/jcvi
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id    intra- genotype    inter- genotype
    """
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args
    bedfw = open(opts.bed, "w") if opts.bed else None

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of genes that show A-B pattern
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)

    # `with` closes the input; try/finally closes the optional BED output
    # even when a malformed row aborts parsing.
    try:
        with open(txtfile) as fp:
            header = next(fp).split()  # Header
            # Columns 2 and 3 of the header name the two genotype columns.
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                        "Only three-column file is supported"
                locus, intra, inter = atoms
                # The locus id is `<contig>.<position>`.
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1
                # Tabulate all possible combinations
                intra = ref + "-" + intra
                inter = alt + "-" + inter
                combinations[(intra, inter)] += 1

                if bedfw:
                    print >> bedfw, "\t".join(str(x) for x in
                                (ctg, pos - 1, pos, locus))
    finally:
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize
    snprate = lambda a: a * 1000. / bpsize
    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
                format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
                format(nsites, len(snps),
                       thousands(sum(sizes.mapping[x] for x in snps.keys())))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
                format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    print >> sys.stderr, m
    print >> sys.stderr, tabulate(combinations)

    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    print >> sys.stderr, leg

    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    print >> sys.stderr, "A total of {0} disparate {1} SNPs in {2} contigs.".\
                format(distinctSNPs, tag, len(distinctSet))

    if not opts.counts:
        return

    # Sanity-check the per-contig tallies before creating the counts file,
    # so a failed check leaves no partial output behind.
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    snpcountsfile = opts.counts
    with open(snpcountsfile, "w") as fw:
        print >> fw, "\t".join(("Contig", "#_SNPs", "#_AB_SNP"))
        for ctg in sorted(snps.keys()):
            snpcount = snpcounts[ctg]
            goodsnpcount = goodsnpcounts[ctg]
            print >> fw, "\t".join(str(x) for x in
                                   (ctg, snpcount, goodsnpcount))

    logging.debug("SNP counts per contig is written to `{0}`.".\
                  format(snpcountsfile))
Example #19
0
File: vcf.py Project: radaniba/jcvi
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id    intra- genotype    inter- genotype
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes live in comments.
    #
    # args: list of command-line tokens (options plus txtfile and fastafile).
    # Output: a summary on stderr, plus optional files named by --bed
    # (one line per locus) and --counts (one line per contig).
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)  # (intra label, inter label) => count
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of contigs that show A-B pattern

    bedfw = open(opts.bed, "w") if opts.bed else None
    try:
        # `with` guarantees the input handle is released; the original never
        # closed it.
        with open(txtfile) as fp:
            header = next(fp).split()  # Header
            # Second and third header fields label the two genotype columns
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                        "Only three-column file is supported"
                locus, intra, inter = atoms
                # locus is "<contig>.<position>"; split on the last dot only
                # since the contig name itself may contain dots
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1
                # Tabulate all possible combinations
                combinations[(ref + "-" + intra, alt + "-" + inter)] += 1

                if bedfw:
                    # Emit the single-base interval [pos - 1, pos)
                    print >> bedfw, "\t".join(str(x) for x in
                                (ctg, pos - 1, pos, locus))
    finally:
        # Close the bed output even when parsing fails half-way
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize
    snprate = lambda a: a * 1000. / bpsize  # events per Kb of the dataset
    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
                format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
                format(nsites, len(snps),
                       thousands(sum(sizes.mapping[x] for x in snps)))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
                format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    print >> sys.stderr, m
    print >> sys.stderr, tabulate(combinations)

    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    print >> sys.stderr, leg

    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    print >> sys.stderr, "A total of {0} disparate {1} SNPs in {2} contigs.".\
                format(distinctSNPs, tag, len(distinctSet))

    if not opts.counts:
        return

    # Internal consistency checks, done before the counts file is opened so
    # a failure does not leave a dangling, half-written output file.
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    snpcountsfile = opts.counts
    with open(snpcountsfile, "w") as fw:
        print >> fw, "\t".join(("Contig", "#_SNPs", "#_AB_SNP"))
        for ctg in sorted(snps):
            print >> fw, "\t".join(str(x) for x in
                                   (ctg, snpcounts[ctg], goodsnpcounts[ctg]))

    logging.debug("SNP counts per contig is written to `{0}`.".\
                  format(snpcountsfile))
Example #20
0
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes live in comments.
    #
    # args: list of command-line tokens (options plus one GFF file).
    # Output: a table of per-group statistics written to opts.outfile.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby",
                 default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Cache of "<mRNA id> <summed exon length> <group>" lines.
    # NOTE(review): the cache is refreshed based on need_update(gff_file, tf)
    # only; it looks like changing --groupby alone would reuse stale group
    # labels -- confirm against need_update().
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1) for c in g.children(fid, 1)
                                 if c.featuretype == "exon")
                print >> fw, "\t".join((fid, str(tsize), conf_class))

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    skipped = 0
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                         if c.featuretype == "mRNA"]
        if not transcripts:
            # A gene without mRNA children has no group to be assigned to;
            # the original code raised IndexError on transcripts[0] here.
            skipped += 1
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                         if c.featuretype == "exon")
        # The gene inherits the group of its first transcript
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    if skipped:
        logging.debug("A total of {0} genes without mRNA children skipped.".\
                      format(skipped))

    r = {}  # Report: (statistic label, group) => value
    distinct_groups = set(conf_classes.values())
    # Loop variable deliberately not named `g`, which is the GFF index above
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if not num_genes:
            # Groups are collected from every transcript, but genes are
            # binned by their first transcript only, so a group may end up
            # with no genes; skip it rather than divide by zero.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        # Guard the remaining denominators, reporting 0 when nothing was seen
        mean_transcript_size = cum_transcript_size * 1. / num_transcripts \
                                if num_transcripts else 0.
        mean_exon_size = cum_exon_size * 1. / num_exons if num_exons else 0.

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = mean_num_transcripts
        r[("Max number of transcripts per gene", group)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    print >> fw, tabulate(r)
    fw.close()