def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.add_option("-c", default=2, type="int",
                 help="Maximal value of a counter")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    mm = MakeManager()
    # `reads` is the group of input files yielded for one sample; a distinct
    # name avoids shadowing the option parser `p`.
    for reads, pf in iter_project(folder):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its inputs from a list file (passed below as @<file>).
        # fw.write() works on both Python 2 and 3 (the old `print >> fw`
        # statement raises TypeError on Python 3), and the context manager
        # closes the handle even if the write fails.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{} -cs{}".format(K, opts.cpus, opts.c)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for paired-end.
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    # `reads` is the group of input files for one sample; a distinct name
    # avoids shadowing the option parser `p`.
    for reads, pf in iter_project(folder, pattern=pattern,
                                  n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its inputs from a list file (passed below as @<file>).
        # The context manager closes the handle even if the write fails.
        with open(infiles, "w") as fw:
            print("\n".join(reads), file=fw)

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        # Truthiness test: both the default None and an explicit 0 omit -cx.
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; two positionals (reads folder, reference) are
    # required, otherwise help is printed and the process exits.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(tophat.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.add_option("--intron", default=15000, type="int",
                 help="Max intron size [default: %default]")
    p.add_option("--dist", default=-50, type="int",
                 help="Mate inner distance [default: %default]")
    p.add_option("--stdev", default=50, type="int",
                 help="Mate standard deviation [default: %default]")
    p.set_phred()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)

    for reads, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        # Samples whose BAM already exists are not re-run.
        if op.exists(outfile):
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        # Collect the command pieces and join them with single spaces.
        pieces = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-G {0}".format(opts.gtf))
        pieces.append("-o {0}".format(outdir))

        if num == 1:
            # Single-end: exactly one file expected (unpacking enforces it).
            (first,) = reads
        else:
            # Paired-end: exactly two files expected.
            first, _second = reads
            # NOTE(review): the intron limit is only passed in paired-end
            # mode, mirroring the original layout - confirm this is intended.
            pieces.append("--max-intron-length {0}".format(opts.intron))
            pieces.append("--mate-inner-dist {0}".format(opts.dist))
            pieces.append("--mate-std-dev {0}".format(opts.stdev))

        # Use the user-supplied quality offset, else guess it from the first
        # read file.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            pieces.append("--phred64-quals")

        pieces.append("{0} {1}".format(reference, " ".join(reads)))
        sh(" ".join(pieces))
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; two positionals (reads folder, reference
    # FASTA) are required, otherwise help is printed and the process exits.
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()
    num = 1 if opts.single else 2

    gd = "GenomeDir"
    mkdir(gd)
    # Common prefix shared by the index-building and alignment commands.
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        index_cmd = " ".join((
            STAR,
            "--runMode genomeGenerate",
            "--genomeFastaFiles {0}".format(reference),
        ))
        mm.add(reference, genomeidx, index_cmd)

    # Step 1: align
    for reads, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"

        pieces = [STAR, "--readFilesIn {0}".format(" ".join(reads))]
        # Gzipped input is decompressed on the fly.
        if reads[0].endswith(".gz"):
            pieces.append("--readFilesCommand zcat")
        pieces.append("--outSAMtype BAM SortedByCoordinate")
        pieces.append("--outFileNamePrefix {0}".format(pf))
        pieces.append("--twopassMode Basic")
        # Compatibility for cufflinks
        pieces.append("--outSAMstrandField intronMotif")
        pieces.append("--outFilterIntronMotifs RemoveNoncanonical")

        mm.add(reads, bamfile, " ".join(pieces))

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for paired-end.
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    # `reads` is the group of input files for one sample; a distinct name
    # avoids shadowing the option parser `p`.
    for reads, pf in iter_project(folder, pattern=pattern,
                                  n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its inputs from a list file (passed below as @<file>).
        # fw.write() works on both Python 2 and 3 (the old `print >> fw`
        # statement raises TypeError on Python 3), and the context manager
        # closes the handle even if the write fails.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        # Truthiness test: both the default None and an explicit 0 omit -cx.
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text;
    # it must use the `%prog` placeholder (the previous `%proj` was printed
    # literally). Implementation notes are kept in comments for that reason.
    #
    # args: command-line tokens; three positionals (reference FASTA, project
    # folder, output directory) are required, otherwise help is printed and
    # the process exits.
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    s3dir = None
    # For an s3:// destination, results are staged in a local directory named
    # after the last path component and then copied up to the bucket.
    if outdir.startswith("s3://"):
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    # `reads` is the group of input files for one sample; a distinct name
    # avoids shadowing the option parser `p`.
    for reads, pf in iter_project(proj_dir):
        targs = [ref_fasta] + reads
        cmd1, bamfile = mem(targs, opts)
        # Only a non-empty alignment command is wrapped by output_bam();
        # cmd1 is placed in the command list either way.
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            cmd = "aws s3 cp {} {} --sse".format(nbamfile,
                                              op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(reads, nbamfile, cmds)

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Minimum value of a counter")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for paired-end.
    n = 1 if opts.single else 2
    mm = MakeManager()
    # `reads` is the group of input files for one sample; a distinct name
    # avoids shadowing the option parser `p`.
    for reads, pf in iter_project(folder, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its inputs from a list file (passed below as @<file>).
        # fw.write() works on both Python 2 and 3 (the old `print >> fw`
        # statement raises TypeError on Python 3), and the context manager
        # closes the handle even if the write fails.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K = opts.k
    cpus = opts.cpus
    mm = MakeManager()
    for reads, pf in iter_project(folder):
        # One intermediate database name per read file: <pf>1.msK, <pf>2.msK
        mss = ["{}{}.ms{}".format(pf, idx, K)
               for idx, _ in enumerate(reads, start=1)]
        # Count kmers in each read file separately.
        cmds = [
            "meryl -B -C -m {} -threads {} -s {} -o {}".format(
                K, cpus, readfile, ms)
            for readfile, ms in zip(reads, mss)
        ]

        # Exactly two databases are expected here (unpacking enforces it).
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Merge the two per-file databases, then drop the intermediates.
        cmds.append("meryl -M add -s {} -s {} -o {}".format(ams, bms, pms))
        cmds.append(
            "rm -f {0}.mcdat {0}.mcidx {1}.mcdat {1}.mcidx".format(ams, bms))
        mm.add(reads, pms + ".mcdat", cmds)

    mm.write()
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the folder
    # scanned by iter_project) is required, otherwise help is printed and the
    # process exits.
    parser = OptionParser(meryl.__doc__)
    parser.add_option("-k", default=19, type="int", help="Kmer size")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    folder = args[0]
    K, cpus = opts.k, opts.cpus
    count_template = "meryl -B -C -m {} -threads {} -s {} -o {}"

    mm = MakeManager()
    for reads, pf in iter_project(folder):
        cmds = []
        mss = []
        # Count kmers in each read file into its own database
        # (<pf>1.msK, <pf>2.msK).
        for number, readfile in enumerate(reads, 1):
            ms = "{}{}.ms{}".format(pf, number, K)
            mss.append(ms)
            cmds.append(count_template.format(K, cpus, readfile, ms))

        # Exactly two databases are expected here (unpacking enforces it).
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Merge the pair into the final database.
        cmds.append("meryl -M add -s {} -s {} -o {}".format(ams, bms, pms))
        # Remove the intermediate per-file databases.
        leftovers = ["{}.{}".format(ms, ext)
                     for ms in (ams, bms) for ext in ("mcdat", "mcidx")]
        cmds.append("rm -f " + " ".join(leftovers))
        mm.add(reads, pms + ".mcdat", cmds)

    mm.write()