def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so keep it short and user-facing.
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # The user enters Mb; the config receives that number divided by 1000
    # (presumably Gb -- confirm against the Meraculous manual).
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Collect the config text piecewise and join once at the end.
    # `header` is a name defined outside this function; it is whitespace-
    # normalized into a single commented column-title line.
    parts = [
        comment_banner("Meraculous params file") + "\n",
        comment_banner("Basic parameters") + "\n",
        "# Describe the libraries ( one line per library )\n",
        "# " + " ".join(header.split()) + "\n",
    ]

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0
    for lib, fs in libs:
        # Libraries reporting size 0 get no library line and no rank.
        if lib.size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Longest read length found across this library's files.
        rl = max(readlen([x]) for x in fs)
        lib_seqs.append(lib.get_lib_seq(wildcard, name, rl, rank))

    parts.append("\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n")

    params = [
        ("genome_size", genomesize),
        ("is_diploid", 0),
        ("mer_size", opts.K),
        ("num_prefix_blocks", 1),
        ("no_read_validation", 0),
        ("local_num_procs", opts.cpus),
    ]
    parts.append("\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n")

    cfgfile = "meraculous.config"
    write_file(cfgfile, "".join(parts), tee=True)

    # Companion launcher script that points Meraculous at the config above.
    cmd = "~/export/meraculous/bin/run_meraculous.sh -c {0}".format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, cmd)
def write_libraries(fastqs, aligner=None):
    """
    Write `libraries.txt` with one space-separated line per library.

    Each line holds: `lib<N>` (1-based), the aligner (only when one is
    given), the library's file names, `lib.size`, the constant 0.75 and the
    orientation code ("RF" when `lib.read_orientation` is "outward",
    otherwise "FR").

    Returns the name of the file written.
    """
    from jcvi.assembly.base import get_libs

    libs = get_libs(fastqs)
    assert libs

    libtxt = "libraries.txt"
    lines = []
    for index, (lib, files) in enumerate(libs, start=1):
        joined_files = " ".join(files)
        if lib.read_orientation == "outward":
            orientation = "RF"
        else:
            orientation = "FR"

        fields = ["lib{0}".format(index)]
        if aligner:
            # The aligner, when supplied, sits right after the library name.
            fields.append(aligner)
        fields.extend([joined_files, lib.size, 0.75, orientation])

        lines.append(" ".join(str(field) for field in fields))

    write_file(libtxt, "\n".join(lines), tee=True)
    return libtxt
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # passed to OptionParser), so implementation notes live in comments.
    #
    # Outputs, all in the current directory:
    #   soap.config    -- every ranked library
    #   soap.gc.config -- only libraries whose asm_flags > 2
    #   run.sh         -- launcher built from the SOAP* / GCRUNG / SCFRUN
    #                     templates defined outside this function
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    # The context manager guarantees both config files are closed even when
    # library detection or read-length scanning raises part-way through.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        libs = get_libs(fnames)

        max_rd_len = max(readlen([f]) for f in fnames)
        block = "max_rd_len={0}\n".format(max_rd_len)
        # write(block + "\n") emits exactly what the print statement did,
        # while parsing under both Python 2 and Python 3.
        for stream in (sys.stderr, fw, fw_gc):
            stream.write(block + "\n")

        # Collect singletons first: files of libraries reporting size 0.
        singletons = []
        for lib, fs in libs:
            if lib.size == 0:
                singletons += fs

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only every library after the first is
            # forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            for f in fs:
                # NOTE(review): a file name containing neither ".1." nor
                # ".2." reuses the previous tag (or fails if there is none);
                # confirm get_libs only yields files following that naming.
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            # Singleton files ride along with the first-ranked library.
            if rank == 1:
                for s in singletons:
                    block += "q={0}\n".format(s)

            sys.stderr.write(block + "\n")
            fw.write(block + "\n")
            if asm_flags > 2:
                fw_gc.write(block + "\n")

    runfile = "run.sh"
    scaffold = opts.scaffold
    header = SOAPHEADER.format(opts.cpus, opts.K)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template, meta="run script")
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # passed to OptionParser), so implementation notes live in comments.
    #
    # Outputs, all in the current directory:
    #   soap.config    -- every ranked library
    #   soap.gc.config -- only libraries whose asm_flags > 2
    #   run.sh         -- launcher built from the SOAP* / GCRUNG / SCFRUN
    #                     templates defined outside this function
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    # The context manager guarantees both config files are closed even when
    # library detection or read-length scanning raises part-way through.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        libs = get_libs(fnames)

        max_rd_len = max(readlen([f]) for f in fnames)
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            print(block, file=stream)

        # Collect singletons first: files of libraries reporting size 0.
        singletons = []
        for lib, fs in libs:
            if lib.size == 0:
                singletons += fs

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only every library after the first is
            # forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            for f in fs:
                # NOTE(review): a file name containing neither ".1." nor
                # ".2." reuses the previous tag (or fails if there is none);
                # confirm get_libs only yields files following that naming.
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            # Singleton files ride along with the first-ranked library;
            # the key is "q" when is_fastq() accepts the file, else "f".
            if rank == 1:
                for s in singletons:
                    tag = "q" if is_fastq(s) else "f"
                    block += tag + "={0}\n".format(s)

            print(block, file=sys.stderr)
            print(block, file=fw)
            if asm_flags > 2:
                print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    # Binary name is chosen by K: the 63mer build up to K=63, else 127mer.
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)