def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # handed to OptionParser below), so it must stay user-facing.
    from itertools import groupby
    from jcvi.formats.base import check_exists
    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # First two dash-separated fields of the basename, taken from the
        # part before the first dot.
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # groupby() only merges *adjacent* items with equal keys, so the input
    # has to be sorted by the same key; otherwise one library can be split
    # into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # Libraries whose size is 0 are passed as plain reads after `-q`.
        if size == 0:
            singletons += fs
            continue

        # Accepted distance range: two standard deviations either side.
        minsize, maxsize = size - 2 * stddev, size + 2 * stddev

        for f in fs:
            # Files with ".corr." in the name always use the "fb" orientation.
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in f:
                # Emit one wildcard entry for the pair; the matching ".2."
                # file is skipped below.
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus 32 -o contigs.fasta \\\n"
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    # check_exists() is defined elsewhere; presumably it returns True when it
    # is fine to (over)write the file -- TODO confirm.
    if check_exists(runfile):
        # `with` guarantees the script is flushed and closed.
        with open(runfile, "w") as fw:
            fw.write("#!/bin/bash\n\n")
            fw.write(cmd + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # handed to OptionParser below), so it must stay user-facing.
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # First two dash-separated fields of the basename, taken from the
        # part before the first dot.
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # groupby() only merges *adjacent* items with equal keys, so the input
    # has to be sorted by the same key; otherwise one library can be split
    # into several [LIB] sections.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    # Ascending size: size-0 libraries come first, and `rank` below grows
    # with library size.
    libs.sort(key=lambda x: x[0].size)

    cfgfile = "soap.config"
    rank = 0
    singletons = []
    # `with` guarantees the config is flushed and closed before run.sh is
    # handled.
    with open(cfgfile, "w") as fw:
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                # Accumulate (not overwrite) so that several size-0
                # libraries are all kept.
                singletons.extend(fs)
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            f = fs[0]
            # Files with ".corr." in the name always get reverse_seq=0.
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            block += "reverse_seq={0}\n".format(reverse_seq)
            block += "asm_flags={0}\n".format(lib.asm_flags)
            block += "rank={0}\n".format(rank)

            # Pending singletons are attached to the first sized library.
            # If every library has size 0 they are never written.
            if singletons:
                fs += singletons
                singletons = []

            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                else:
                    tag = "q"
                block += "{0}={1}\n".format(tag, f)

            # Echo each section to stderr as well as writing it to the file.
            sys.stderr.write(block + "\n")
            fw.write(block + "\n")

    runfile = "run.sh"
    # check_exists() is defined elsewhere; presumably it returns True when it
    # is fine to (over)write the file -- TODO confirm.
    if check_exists(runfile):
        with open(runfile, "w") as fw:
            fw.write(SOAPRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above is also the command-line usage text (it is
    # handed to OptionParser below), so it must stay user-facing.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name = upper-cased initials of the organism name.
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, pick up every *.fastq* in the cwd.
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()

    groupcontents = []
    libs = []  # library names in first-seen order, no duplicates
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # Already covered by the wildcard entry of its ".1." mate.
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        lib = Library(library_name)
        lib_type = lib.type  # local renamed so it does not shadow `type`
        paired = lib.paired
        read_orientation = lib.read_orientation

        # Falsy size/stddev (e.g. 0) become empty CSV cells.
        size = lib.size or ""
        stddev = lib.stddev or ""

        # "fragment" libraries fill the frag_* columns; every other type
        # fills the insert_* columns.
        is_fragment = lib_type == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""

        libcontents.append((library_name, project_name, organism_name,
                            lib_type, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # Message now names the file that is actually written (in_groups.csv).
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    # check_exists() is defined elsewhere; presumably it returns True when it
    # is fine to (over)write the file -- TODO confirm.
    if not opts.norun and check_exists(runfile):
        # `with` guarantees the script is flushed and closed.
        with open(runfile, "w") as fw:
            fw.write(ALLPATHSRUN + "\n")
        logging.debug("Run script written to `{0}`.".format(runfile))