def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim and the documentation lives in comments.
    #
    # args: list of command-line tokens for this subcommand; exactly one
    #       positional argument (the FASTA file) is required.
    # Returns None. The summary statistics are printed to stderr and the
    # collected sizes are passed on to stem_leaf_plot().
    # Raises ValueError if a non-singleton header does not split into exactly
    # four whitespace-separated fields or the size field is not an integer;
    # AssertionError if the second field is not the literal "with".
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit status is the negation of whatever print_help() returns.
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, _ in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            # A singleton is counted as a cluster of one read.
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        _, w, size, _ = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    # The Python 2 form `print >> sys.stderr, s` is an ordinary expression in
    # Python 3 and fails with TypeError; use the print() function, as the
    # other revisions of this command in the file already do.
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring doubles as the usage text given to OptionParser, so it is
    # left untouched; explanatory notes are written as comments instead.
    #
    # args: command-line tokens for this subcommand (one FASTA file expected,
    #       plus the optional --minsize and the outfile option).
    # Writes every non-singleton record whose header reports at least
    # --minsize sequences to the chosen output, in FASTA format.
    from jcvi.formats.fasta import Fasta, SeqIO

    parser = OptionParser(filter.__doc__)
    parser.add_option(
        "--minsize",
        default=10,
        type="int",
        help="Minimum cluster size",
    )
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (fastafile,) = args
    threshold = opts.minsize

    records = Fasta(fastafile, lazy=True)
    out = must_open(opts.outfile, "w")
    for header, record in records.iterdescriptions_ordered():
        if not header.startswith("singleton"):
            # consensus_for_cluster_0 with 63 sequences
            name, keyword, nseqs, seqs = header.split()
            assert keyword == "with"
            if int(nseqs) >= threshold:
                SeqIO.write(record, out, "fasta")
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    # NOTE: count.__doc__ is used as the OptionParser usage string, hence the
    # docstring is kept as-is and details are documented here.
    #
    # args: list of command-line tokens; a single FASTA path is required,
    #       otherwise the help text is printed and the process exits.
    # Output: summary statistics of the cluster sizes on stderr, then the
    #         sizes are handed to stem_leaf_plot().
    # Raises: ValueError for a non-singleton header that is not made of four
    #         whitespace-separated fields with an integer third field;
    #         AssertionError when the second field is not "with".
    from jcvi.formats.fasta import Fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)

    sizes = []
    for desc, _ in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            # Singletons contribute a size of exactly one.
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        _, w, size, _ = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    # Replaces the Python 2 print statement (`print >> sys.stderr, s`), which
    # raises TypeError when evaluated under Python 3.
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring is passed to OptionParser as usage text and is therefore
    # kept verbatim; the notes below document the behaviour.
    #
    # args: command-line tokens; one or more FASTA files plus optional
    #       --minsize (default 2) and the outfile option.
    # For every input file, non-singleton records whose header reports at
    # least --minsize sequences are written to the output with their id
    # prefixed by a per-file tag ("s000", "s001", ...). Per-file and overall
    # counts are reported through logging.debug.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)  # per-file prefix, e.g. "s000"
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue

            # consensus_for_cluster_0 with 63 sequences
            _, w, size, _ = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size  # counted even when the cluster is filtered out
            if size < minsize:
                continue

            # Drop the first whitespace-delimited token of the description
            # (presumably the record id -- confirm) before rewriting the id.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")

        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # An empty file, or one holding only singletons, has no non-singleton
        # cluster; report an average of 0 instead of raising
        # ZeroDivisionError.
        avg = creads / cclusters if cclusters else 0
        # NOTE(review): cclusters counts every non-singleton cluster, including
        # those skipped by --minsize, although the message says "Saved".
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons

    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # filter.__doc__ is the usage string shown by OptionParser, so the
    # docstring stays unchanged and the documentation is given as comments.
    #
    # args: command-line tokens; at least one FASTA file, optional --minsize
    #       (default 2) and the outfile option.
    # Each non-singleton record reporting at least --minsize sequences is
    # written out with a per-file prefix ("s000_", "s001_", ...) added to its
    # id. Scan statistics are emitted with logging.debug.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue

            # consensus_for_cluster_0 with 63 sequences
            _, w, size, _ = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size  # reads are tallied before the size filter
            if size < minsize:
                continue

            # Keep everything after the first whitespace-delimited token of
            # the description, then tag the id with the per-file prefix.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")

        logging.debug(
            "Scanned {0} clusters with {1} reads ..".format(nclusters, nreads)
        )
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # Guard the average: a file with no non-singleton clusters would
        # otherwise raise ZeroDivisionError here.
        if cclusters:
            avg = creads / cclusters
        else:
            avg = 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".format(
                cclusters, minsize, creads, avg, pf
            )
        )
        totalreads += nreads
        totalassembled += nreads - nsingletons

    logging.debug(
        "Total assembled: {0}".format(percentage(totalassembled, totalreads))
    )
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    # NOTE: the docstring is reused as the OptionParser usage text, so it is
    # preserved and the behaviour is documented in comments.
    #
    # args: command-line tokens; exactly one FASTA file, optional --csv FILE.
    # Three header shapes are handled:
    #   * "singleton..."                              -> size 1
    #   * "consensus_for_cluster_0 with 63 sequences" -> size 63
    #   * "MRD85:00603:02472;size=167;"               -> size 167
    # With --csv, a tab-separated row (name, size, len(rec)) is written for
    # headers of the "with" form only. Summary statistics go to stderr and
    # the sizes are passed to stem_leaf_plot().
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    csv = open(opts.csv, "w") if opts.csv else None
    sizes = []
    # try/finally guarantees the CSV handle is closed even when a malformed
    # header raises part-way through the scan.
    try:
        f = Fasta(fastafile, lazy=True)
        for desc, rec in f.iterdescriptions_ordered():
            if desc.startswith("singleton"):
                sizes.append(1)
                continue

            # consensus_for_cluster_0 with 63 sequences
            if "with" in desc:
                name, w, size, _ = desc.split()
                # Validate before emitting so a malformed header never
                # produces a CSV row.
                assert w == "with"
                if csv:
                    print("\t".join(str(x) for x in (name, size, len(rec))),
                          file=csv)
                sizes.append(int(size))
            # MRD85:00603:02472;size=167;
            else:
                _, size, _ = desc.split(";")
                sizes.append(int(size.replace("size=", "")))
    finally:
        if csv is not None:
            csv.close()

    if csv is not None:
        logging.debug("File written to `%s`.", opts.csv)

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    # count.__doc__ is the usage string given to OptionParser; it is kept
    # verbatim and the details are documented in these comments.
    #
    # args: command-line tokens; one FASTA file is required, --csv FILE is
    #       optional.
    # Headers starting with "singleton" count as size 1; headers containing
    # "with" are parsed as "<name> with <size> <word>"; any other header is
    # parsed as "<name>;size=<size>;<tail>". When --csv is given, a
    # tab-separated (name, size, len(rec)) row is written for the "with"
    # form only. Summary statistics are printed to stderr and the sizes are
    # passed to stem_leaf_plot().
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csv = open(opts.csv, "w") if opts.csv else None
    sizes = []
    # Close the CSV handle on every exit path; previously an exception while
    # parsing a header left the file open.
    try:
        f = Fasta(fastafile, lazy=True)
        for desc, rec in f.iterdescriptions_ordered():
            if desc.startswith("singleton"):
                sizes.append(1)
                continue

            # consensus_for_cluster_0 with 63 sequences
            if "with" in desc:
                name, w, size, _ = desc.split()
                # Check the header shape first so that no row is written for
                # a header that is about to be rejected.
                assert w == "with"
                if csv:
                    row = (name, size, len(rec))
                    print("\t".join(str(x) for x in row), file=csv)
                sizes.append(int(size))
            # MRD85:00603:02472;size=167;
            else:
                _, size, _ = desc.split(";")
                sizes.append(int(size.replace("size=", "")))
    finally:
        if csv is not None:
            csv.close()

    if csv is not None:
        logging.debug("File written to `{0}`".format(opts.csv))

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")