コード例 #1
0
ファイル: tgbs.py プロジェクト: biologyguy/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    print >> sys.stderr, s
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
コード例 #2
0
ファイル: cdhit.py プロジェクト: kvefimov/jcvi_062915
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
コード例 #3
0
ファイル: tgbs.py プロジェクト: zachary-zzc/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.formats.fasta import Fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    print >> sys.stderr, s
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
コード例 #4
0
ファイル: cdhit.py プロジェクト: biologyguy/jcvi
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=10,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
コード例 #5
0
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=2,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, creads / cclusters, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
コード例 #6
0
ファイル: cdhit.py プロジェクト: tanghaibao/jcvi
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".\
                       format(nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        logging.debug("Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".\
                       format(cclusters, minsize, creads, creads / cclusters, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".\
                  format(percentage(totalassembled, totalreads)))
コード例 #7
0
ファイル: tgbs.py プロジェクト: zjwang6/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x) for x in (name, size, len(rec))),
                      file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
            name, size, tail = desc.split(";")
            sizes.append(int(size.replace("size=", "")))

    if csv:
        csv.close()
        logging.debug("File written to `%s`.", opts.csv)

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
コード例 #8
0
ファイル: tgbs.py プロジェクト: tanghaibao/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x)
                                for x in (name, size, len(rec))), file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
            name, size, tail = desc.split(";")
            sizes.append(int(size.replace("size=", "")))

    if csv:
        csv.close()
        logging.debug("File written to `{0}`".format(opts.csv))

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")