def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # The docstring above is passed to OptionParser, so it is kept as
    # user-facing usage text; developer notes live in comments instead.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # One AGP object per ordering file; remember every contig that was placed.
    anchored = set()
    for ordering_path in orderingfiles:
        ordering = ContigOrdering(ordering_path)
        anchored.update(entry.contig_name for entry in ordering)
        # Object name is the file name up to its first dot
        object_name = op.basename(ordering_path).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    # Contigs absent from every ordering are emitted on their own, with
    # orientation "?".
    singletons = contigs - anchored
    summary = 'Anchored: {}, Singletons: {}'.format(len(anchored),
                                                    len(singletons))
    logging.debug(summary)

    for name in natsorted(singletons):
        order_to_agp(name, [(name, "?")], sizes, fwagp)
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring is handed to OptionParser as the usage message,
    # which is why implementation notes are written as comments here.
    optparser = OptionParser(agp.__doc__)
    optparser.set_outfile()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())
    anchored = set()

    for ofile in orderingfiles:
        ordering = ContigOrdering(ofile)
        # Track which contigs this ordering places
        anchored |= {member.contig_name for member in ordering}
        # The part of the file name before the first dot names the AGP object
        prefix = op.basename(ofile).split('.')[0]
        ordering.write_agp(prefix, sizes, fwagp)

    # Whatever was never placed gets written as a stand-alone object
    singletons = contigs - anchored
    n_anchored, n_singletons = len(anchored), len(singletons)
    logging.debug('Anchored: {}, Singletons: {}'.format(n_anchored,
                                                        n_singletons))

    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def chimera(args):
    """
    %prog chimera bamfile

    Parse BAM file from `bwasw` and list multi-hit reads and breakpoints.
    """
    # The docstring is passed to OptionParser as usage text, so it stays
    # user-facing. Output: one tab-separated line per read name on stdout
    # (name, number of records, "|"-joined sorted breakpoints or "na").
    import pysam
    from jcvi.utils.natsort import natsorted

    p = OptionParser(chimera.__doc__)
    p.set_verbose()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samfile, = args
    samfile = pysam.AlignmentFile(samfile)
    rstore = defaultdict(list)   # read name -> all breakpoints seen
    hstore = defaultdict(int)    # read name -> number of records
    try:
        for r in samfile.fetch():
            # Evaluate the breakpoints once and reuse them for the verbose
            # trace (assumes breakpoint() is a pure function of the read).
            bps = list(breakpoint(r))
            rstore[r.query_name] += bps
            hstore[r.query_name] += 1
            if opts.verbose:
                fields = (r.query_name, "+-"[r.is_reverse],
                          sum(l for o, l in r.cigartuples),
                          r.cigarstring, bps)
                # sys.stderr.write works on both Python 2 and Python 3,
                # unlike the `print >>` statement.
                sys.stderr.write(" ".join(str(f) for f in fields) + "\n")
    finally:
        # Release the BAM handle even if iteration fails midway
        samfile.close()

    for rn, bps in natsorted(rstore.items()):
        bps = "|".join(str(x) for x in sorted(bps)) if bps else "na"
        print("\t".join((rn, str(hstore[rn]), bps)))
def cat(args):
    """
    %prog cat *.pdf -o output.pdf

    Concatenate pages from pdf files into a single pdf file.

    Page ranges refer to the previously-named file.
    A file not followed by a page range means all the pages of the file.

    PAGE RANGES are like Python slices.
            {page_range_help}
    EXAMPLES
        pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
            Concatenate all of head.pdf, all but page seven of content.pdf,
            and the last page of tail.pdf, producing output.pdf.

        pdfcat chapter*.pdf >book.pdf
            You can specify the output file by redirection.

        pdfcat chapter?.pdf chapter10.pdf >book.pdf
            In case you don't want chapter 10 before chapter 2.
    """
    # The docstring is a str.format() template (page_range_help) and is used
    # as the usage text, so it must not gain any other brace characters.
    p = OptionParser(cat.__doc__.format(page_range_help=PAGE_RANGE_HELP))
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort file names")
    p.set_outfile()
    p.set_verbose(help="Show page ranges as they are being read")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    outfile = opts.outfile
    # Never treat the output file as one of the inputs
    if outfile in args:
        args.remove(outfile)

    if not opts.nosort:
        args = natsorted(args)

    filename_page_ranges = parse_filename_page_ranges(args)
    verbose = opts.verbose
    fw = must_open(outfile, "wb")

    merger = PdfFileMerger()
    in_fs = {}  # filename -> open input handle, each file opened only once
    try:
        try:
            for (filename, page_range) in filename_page_ranges:
                if verbose:
                    sys.stderr.write("{0} {1}\n".format(filename, page_range))
                if filename not in in_fs:
                    in_fs[filename] = open(filename, "rb")
                merger.append(in_fs[filename], pages=page_range)
        except Exception:
            # `except Exception` (not a bare except) lets Ctrl-C and
            # SystemExit propagate instead of being reported as read errors.
            sys.stderr.write(traceback.format_exc() + "\n")
            sys.stderr.write("Error while reading " + filename + "\n")
            sys.exit(1)
        # Inputs stay open until here: they are only closed after write()
        merger.write(fw)
    finally:
        for handle in in_fs.values():
            handle.close()
        fw.close()
def add_help_from_choices(self, o):
    """Rewrite the help text of option `o` in place.

    The help is capitalized, any trailing "[...]" section is stripped, and a
    fresh "[default: ...]" tag is appended. For options of type "choice" the
    allowed values (naturally sorted, truncated after 100 characters) are
    listed as well.

    Options whose help is SUPPRESS_HELP, or that have no help at all, are
    left untouched. Returns None.
    """
    # Hidden options must stay hidden; options declared without help have
    # nothing to rewrite (slicing None would raise TypeError).
    if o.help is None or o.help == SUPPRESS_HELP:
        return

    default_tag = "%default"
    help_pf = o.help[:1].upper() + o.help[1:]
    # Drop an existing "[default: ...]"-style suffix so it is not duplicated
    if "[" in help_pf:
        help_pf = help_pf.rsplit("[", 1)[0]
    help_pf = help_pf.strip()

    if o.type == "choice":
        # Imported here because only the choice branch needs natural sorting
        from jcvi.utils.natsort import natsorted

        if o.default is None:
            default_tag = "guess"
        ctext = "|".join(natsorted(o.choices))
        if len(ctext) > 100:
            ctext = ctext[:100] + " ... "
        choice_text = "must be one of {0}".format(ctext)
        o.help = "{0}, {1} [default: {2}]".format(help_pf, choice_text,
                                                  default_tag)
    else:
        o.help = help_pf
        if o.default is None:
            default_tag = "disabled"
        # --help and store_false switches get no default tag
        if o.get_opt_string() != "--help" and o.action != "store_false":
            o.help += " [default: {0}]".format(default_tag)
def cat(args):
    """
    %prog cat *.pdf -o output.pdf

    Concatenate pages from pdf files into a single pdf file.

    Page ranges refer to the previously-named file.
    A file not followed by a page range means all the pages of the file.

    PAGE RANGES are like Python slices.
            {page_range_help}
    EXAMPLES
        pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
            Concatenate all of head.pdf, all but page seven of content.pdf,
            and the last page of tail.pdf, producing output.pdf.

        pdfcat chapter*.pdf >book.pdf
            You can specify the output file by redirection.

        pdfcat chapter?.pdf chapter10.pdf >book.pdf
            In case you don't want chapter 10 before chapter 2.
    """
    # The docstring is both a str.format() template (page_range_help) and
    # the usage text, so no other brace characters may be added to it.
    p = OptionParser(cat.__doc__.format(page_range_help=PAGE_RANGE_HELP))
    p.set_outfile()
    p.set_verbose(help="Show page ranges as they are being read")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    outfile = opts.outfile
    # The output file must not be merged into itself
    if outfile in args:
        args.remove(outfile)

    args = natsorted(args)

    filename_page_ranges = parse_filename_page_ranges(args)
    verbose = opts.verbose
    fw = must_open(outfile, "wb")

    merger = PdfFileMerger()
    in_fs = {}  # filename -> open input handle, each file opened only once
    try:
        try:
            for (filename, page_range) in filename_page_ranges:
                if verbose:
                    sys.stderr.write("{0} {1}\n".format(filename, page_range))
                if filename not in in_fs:
                    in_fs[filename] = open(filename, "rb")
                merger.append(in_fs[filename], pages=page_range)
        except Exception:
            # Narrower than a bare except: KeyboardInterrupt and SystemExit
            # are no longer misreported as read errors.
            sys.stderr.write(traceback.format_exc() + "\n")
            sys.stderr.write("Error while reading " + filename + "\n")
            sys.exit(1)
        # Input handles are kept open until the merged file is written
        merger.write(fw)
    finally:
        for handle in in_fs.values():
            handle.close()
        fw.close()
def setop(args):
    """
    %prog setop "fileA & fileB" > newfile

    Perform set operations, except on files. The files (fileA and fileB) contain
    list of ids. The operator is one of the four:

    |: union (elements found in either file)
    &: intersection (elements found in both)
    -: difference (elements in fileA but not in fileB)
    ^: symmetric difference (elementes found in either set but not both)

    Please quote the argument to avoid shell interpreting | and &.
    """
    # The docstring is the usage text given to OptionParser; it is kept
    # word-for-word. Result ids are printed one per line, naturally sorted.
    import operator
    from jcvi.utils.natsort import natsorted

    p = OptionParser(setop.__doc__)
    p.add_option(
        "--column", default=0, type="int",
        help="The column to extract, 0-based, -1 to disable [default: %default]"
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # Operator symbol -> binary function applied to the two SetFile objects
    set_ops = {
        '|': operator.or_,
        '&': operator.and_,
        '-': operator.sub,
        '^': operator.xor,
    }

    statement, = args
    tokens = statement.split()
    # Validate explicitly: an `assert` vanishes under `python -O`, and a
    # malformed statement would otherwise die with a bare unpacking error.
    if len(tokens) != 3 or tokens[1] not in set_ops:
        sys.exit(not p.print_help())

    # `symbol` rather than `op`, which other functions use for os.path
    fa, symbol, fb = tokens
    column = opts.column
    fa = SetFile(fa, column=column)
    fb = SetFile(fb, column=column)

    t = set_ops[symbol](fa, fb)

    for x in natsorted(t):
        print(x)
def setop(args):
    """
    %prog setop "fileA & fileB" > newfile

    Perform set operations, except on files. The files (fileA and fileB) contain
    list of ids. The operator is one of the four:

    |: union (elements found in either file)
    &: intersection (elements found in both)
    -: difference (elements in fileA but not in fileB)
    ^: symmetric difference (elementes found in either set but not both)

    Please quote the argument to avoid shell interpreting | and &.
    """
    # NOTE: the docstring doubles as the OptionParser usage message and is
    # therefore left word-for-word. Resulting ids go to stdout, one per line.
    import operator
    from jcvi.utils.natsort import natsorted

    p = OptionParser(setop.__doc__)
    p.add_option("--column", default=0, type="int",
                 help="The column to extract, 0-based, -1 to disable [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # Dispatch table replacing the if/elif chain over the operator symbol
    dispatch = {
        '|': operator.or_,
        '&': operator.and_,
        '-': operator.sub,
        '^': operator.xor,
    }

    statement, = args
    parts = statement.split()
    # Explicit check instead of `assert` (stripped by -O) and instead of an
    # unpacking ValueError when the statement is not "fileA <op> fileB".
    if len(parts) != 3 or parts[1] not in dispatch:
        sys.exit(not p.print_help())

    # Avoid the local name `op`; other functions use it for os.path
    fa, opsym, fb = parts
    column = opts.column
    fa = SetFile(fa, column=column)
    fb = SetFile(fb, column=column)

    t = dispatch[opsym](fa, fb)

    for x in natsorted(t):
        print(x)
def get_seqstarts(bamfile, N):
    """ Go through the SQ headers and pull out all sequences with size
    greater than the resolution settings, i.e. contains at least a few cells

    Args:
        bamfile: path to the BAM file whose header is read.
        N: bin size; sequences with LN < 10 * N are skipped.

    Returns:
        (seqstarts, seqsize, total_bins), where seqsize maps sequence name to
        its number of bins (LN // N + 1), seqstarts maps sequence name to the
        cumulative bin offset of that sequence in natural-sort order, and
        total_bins is the sum of all bin counts.
    """
    import pysam

    bam = pysam.AlignmentFile(bamfile, "rb")
    try:
        seqsize = {}
        for kv in bam.header["SQ"]:
            if kv["LN"] < 10 * N:
                continue
            # Floor division keeps bin counts integral on Python 3 as well
            # (plain `/` would yield floats there).
            seqsize[kv["SN"]] = kv["LN"] // N + 1
    finally:
        # Only the header is needed; release the file handle right away
        bam.close()

    allseqs = natsorted(seqsize.keys())
    allseqsizes = np.array([seqsize[x] for x in allseqs])
    seqstarts = np.cumsum(allseqsizes)
    # Rolling by one moves the grand total to slot 0 and turns the remaining
    # entries into start offsets; the total is saved before being zeroed.
    seqstarts = np.roll(seqstarts, 1)
    total_bins = seqstarts[0]
    seqstarts[0] = 0
    seqstarts = dict(zip(allseqs, seqstarts))

    return seqstarts, seqsize, total_bins
def add_help_from_choices(self, o):
    """Rewrite the help text of option `o` in place.

    The help is capitalized, any trailing "[...]" section is stripped, and a
    fresh "[default: ...]" tag is appended. For options of type "choice" the
    allowed values (naturally sorted, truncated after 100 characters) are
    listed as well.

    Options whose help is optparse's SUPPRESS_HELP marker, or that have no
    help at all, are left untouched. Returns None.
    """
    from optparse import SUPPRESS_HELP

    # Without this guard a hidden option would have its SUPPRESS_HELP marker
    # rewritten into ordinary help text, and help=None would raise TypeError
    # on slicing.
    if o.help is None or o.help == SUPPRESS_HELP:
        return

    default_tag = "%default"
    help_pf = o.help[:1].upper() + o.help[1:]
    # Drop an existing "[default: ...]"-style suffix so it is not duplicated
    if "[" in help_pf:
        help_pf = help_pf.rsplit("[", 1)[0]
    help_pf = help_pf.strip()

    if o.type == "choice":
        # Imported here because only the choice branch needs natural sorting
        from jcvi.utils.natsort import natsorted

        if o.default is None:
            default_tag = "guess"
        ctext = "|".join(natsorted(o.choices))
        if len(ctext) > 100:
            ctext = ctext[:100] + " ... "
        choice_text = "must be one of {0}".format(ctext)
        o.help = "{0}, {1} [default: {2}]".format(help_pf, choice_text,
                                                  default_tag)
    else:
        o.help = help_pf
        if o.default is None:
            default_tag = "disabled"
        # --help and store_false switches get no default tag
        if o.get_opt_string() != "--help" and o.action != "store_false":
            o.help += " [default: {0}]".format(default_tag)
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # The docstring is passed to OptionParser as usage text. Side effects:
    # writes the tours to a file named "tour" in the current directory and
    # prints progress to stdout.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    # `with` guarantees the GLM handle is closed (it used to be leaked)
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            if x == 'X':  # column header row
                continue
            M[int(x), int(y)] = int(z)

    fwtour = open("tour", "w")

    def callback(tour, gen, oo):
        # Record the tour under a label "GA-<gen>[-<fitness>]"
        fitness = tour.fitness if hasattr(tour, "fitness") else None
        label = "GA-{0}".format(gen)
        if fitness:
            # Keep only the first component of the fitness representation
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
            label += "-" + fitness
        print_tour(fwtour, tour, label, contig_names, oo)
        return tour

    try:
        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            oo.extend(contig_ids[x.contig_name] for x in co)
            pf = op.basename(ofile).split(".")[0]
            # Single-argument print(...) behaves the same on Python 2 and 3
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate", score_evaluate_M,
                             tour_sizes=tour_sizes, tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                        callback=callbacki)
            print("{0} {1}".format(tour, tour.fitness))
            # NOTE(review): only the first ordering file is processed; the
            # remaining files are ignored. Looks deliberate -- confirm.
            break
    finally:
        # Close the tour file even when the GA run raises
        fwtour.close()
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    # The docstring is passed to OptionParser as the usage text; developer
    # notes are kept in comments for that reason.
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import plt, savefig, markup, normalize_axes, \
        downsample, plot_breaks_and_labels, thousands

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90,
                                            cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])      # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])    # the dot plot

    # One CSVMapLine per row of the map file
    fp = must_open(csvfile)
    raw_data = [CSVMapLine(row) for row in fp]
    # Collected as in the original; nothing in this function reads it
    seen = set(marker.seqid for marker in raw_data)

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    x_ends = list(np.cumsum(ctg_sizes))
    x_begins = [0] + x_ends
    qbreaks = list(zip(ctgs, x_begins, x_ends))
    qstarts = dict(zip(ctgs, x_begins))

    # Y-axis is the map
    def linkage_group(marker):
        return marker.lg

    # groupby needs its input sorted by the same key
    raw_data.sort(key=linkage_group)
    lg_extent = {}
    for lg, members in groupby(raw_data, key=linkage_group):
        # Size of a linkage group is its largest cm value
        lg_extent[lg] = max(marker.cm for marker in members)
    ssizes = natsorted(lg_extent.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    y_ends = list(np.cumsum(lg_sizes))
    y_begins = [0] + y_ends
    lg_labels = ["LG" + lg for lg in lgs]
    sbreaks = list(zip(lg_labels, y_begins, y_ends))
    sstarts = dict(zip(lgs, y_begins))

    # Re-code all the scatter dots; markers on unknown sequences are dropped
    data = []
    for marker in raw_data:
        if marker.seqid not in qstarts:
            continue
        data.append((qstarts[marker.seqid] + marker.pos,
                     sstarts[marker.lg] + marker.cm, 'g'))
    npairs = downsample(data)

    xs, ys, colors = zip(*data)
    ax.scatter(xs, ys, c=colors, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx = shorten(gx, maxchar=30)
    gy = shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy,
                                        xsize, ysize, qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy) + \
            " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    # Default image name: csv name with its last extension swapped for the
    # image format
    image_name = opts.outfile
    if not image_name:
        image_name = csvfile.rsplit(".", 1)[0] + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()