def merger(args):
    """
    %prog merger layout gkpStore contigs.fasta

    Merge reads into one contig.
    """
    p = OptionParser(merger.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    layout, gkpstore, contigs = args
    fp = open(layout)
    pf = "0"
    iidfile = pf + ".iids"
    for i, row in enumerate(fp):
        logging.debug("Read unitig {0}".format(i))
        fw = open(iidfile, "w")
        layout = row.split("|")
        print("\n".join(layout), file=fw)
        fw.close()

        # Dump the reads of this unitig out of the gatekeeper store
        cmd = "gatekeeper -iid {0}.iids -dumpfasta {0} {1}".format(pf, gkpstore)
        sh(cmd)

        # Renumber the reads sequentially and strip descriptions
        fastafile = "{0}.fasta".format(pf)
        newfastafile = "{0}.new.fasta".format(pf)
        format([
            fastafile,
            newfastafile,
            "--sequential=replace",
            "--sequentialoffset=1",
            "--nodesc",
        ])
        fasta([newfastafile])
        sh("rm -rf {0}".format(pf))

        # Assemble the reads of this unitig with runCA
        cmd = "runCA {0}.frg -p {0} -d {0} consensus=pbutgcns".format(pf)
        cmd += " unitigger=bogart doFragmentCorrection=0 doUnitigSplitting=0"
        sh(cmd)

        # Append the resulting contigs, degenerates and singletons
        outdir = "{0}/9-terminator".format(pf)
        cmd = "cat {0}/{1}.ctg.fasta {0}/{1}.deg.fasta {0}/{1}.singleton.fasta".format(
            outdir, pf
        )
        sh(cmd, outfile=contigs, append=True)

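# Illustrative sketch, separate from the pipeline above: the layout file fed
# to `merger` is assumed to carry one unitig per row, with read IIDs joined
# by "|"; the loop rewrites each row into the one-IID-per-line format that
# `gatekeeper -iid` reads. The IIDs below are made up.
def _demo_merger_layout_row():
    row = "101|102|103"
    iids = "\n".join(row.split("|"))
    assert iids == "101\n102\n103"
    return iids
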
def build(args):
    """
    %prog build current.fasta Bacteria_Virus.fasta prefix

    Build assembly files after a set of clean-ups:
    1. Use cdhit (100%) to remove duplicate scaffolds
    2. Screen against the bacteria and virus database (remove scaffolds with
       95% id, 50% cov)
    3. Mask matches to UniVec_Core
    4. Sort by decreasing scaffold sizes
    5. Rename the scaffolds sequentially
    6. Build the contigs by splitting scaffolds at gaps
    7. Rename the contigs sequentially
    """
    from jcvi.apps.cdhit import deduplicate
    from jcvi.apps.vecscreen import mask
    from jcvi.formats.fasta import sort

    p = OptionParser(build.__doc__)
    p.add_option(
        "--nodedup",
        default=False,
        action="store_true",
        help="Do not deduplicate [default: deduplicate]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    fastafile, bacteria, pf = args
    dd = (
        deduplicate([fastafile, "--pctid=100"])
        if not opts.nodedup
        else fastafile
    )
    screenfasta = screen([dd, bacteria])
    tidyfasta = mask([screenfasta])
    sortedfasta = sort([tidyfasta, "--sizes"])
    scaffoldfasta = pf + ".assembly.fasta"
    format([sortedfasta, scaffoldfasta, "--prefix=scaffold_", "--sequential"])
    gapsplitfasta = pf + ".gapSplit.fasta"
    cmd = "gapSplit -minGap=10 {0} {1}".format(scaffoldfasta, gapsplitfasta)
    sh(cmd)
    contigsfasta = pf + ".contigs.fasta"
    format([gapsplitfasta, contigsfasta, "--prefix=contig_", "--sequential"])

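# A minimal pure-Python sketch of the gap-splitting step above (`gapSplit
# -minGap=10`): break a scaffold into contigs at runs of 10 or more Ns. This
# only illustrates the idea; the pipeline calls the external gapSplit binary.
def _demo_gap_split(seq, mingap=10):
    import re

    return [ctg for ctg in re.split("[Nn]{%d,}" % mingap, seq) if ctg]

# _demo_gap_split("ACGT" + "N" * 12 + "TTGCA")  # -> ["ACGT", "TTGCA"]
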
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be
        # used in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of the pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".format(", ".join(sorted(excludecontigs))))

    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs
    logging.debug("Pull leftover singletons: {0}".format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")
    fw.close()

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile,
        finalfasta,
        "--sequential",
        "--pad0=3",
        "--prefix={0}_".format(rid),
    ])
    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)

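# Sketch of the filtering rule used in `overlap`: a minimus2 contig is
# dropped when its read set is disjoint from the original IDs, i.e. it is
# built entirely from pulled-in pool sequences. The IDs below are made up.
def _demo_exclude_rule():
    originalIDs = {"ctg.1", "ctg.2"}
    pulled_only = {"pool7", "pool9"}
    mixed = {"ctg.1", "pool7"}
    assert pulled_only.isdisjoint(originalIDs)  # excluded
    assert not mixed.isdisjoint(originalIDs)    # kept
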
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option(
        "--cleanfasta",
        default=False,
        action="store_true",
        help="Clean FASTA to remove description [default: %default]",
    )
    p.add_option(
        "--highqual",
        default=False,
        action="store_true",
        help="Reads are of high quality [default: %default]",
    )
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append('Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")

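# For reference, the `run.sh` written above would read roughly as follows
# ($PBJELLY_HOME and $CPUS stand in for opts.pbjelly_home and opts.cpus):
#
#   source $PBJELLY_HOME/setup.sh
#   Jelly.py setup Protocol.xml
#   Jelly.py mapping Protocol.xml
#   Jelly.py support Protocol.xml
#   Jelly.py extraction Protocol.xml
#   Jelly.py assembly Protocol.xml -x "--nproc=$CPUS"
#   Jelly.py output Protocol.xml
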
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option(
        "--technology",
        choices=("illumina", "454", "iontorrent"),
        default="iontorrent",
        help="Sequencing platform",
    )
    p.add_option(
        "--dedup",
        choices=("uclust", "cdhit"),
        default="cdhit",
        help="Dedup algorithm",
    )
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit")
    p.set_home("fiona")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    # Digital normalization caps the per-locus read depth
    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # Estimate genome size from the 23-mer histogram
    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([
            diginormfile,
            "--prefix={0}".format(pf),
            "--cpus={0}".format(cpus),
        ])

    genomesize = histogram([jf, pf, "23"])

    # Error-correct the normalized reads with fiona
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "bin/fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".format(
            genomesize, cpus, opts.technology
        )
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    # Collapse near-identical reads into consensus sequences
    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([
                fiona,
                "--consensus",
                "--reads",
                "--pctid={0}".format(pctid),
                "--cdhit_home={0}".format(opts.cdhit_home),
            ])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    # Drop low-coverage clusters and short sequences
    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([
            cons,
            "--outfile={0}".format(covfile),
            "--minsize={0}".format(depth // 5),  # integer division for a count cutoff
        ])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([
            filteredfile,
            finalfile,
            "--sequential=replace",
            "--prefix={0}_".format(pf),
        ])

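# File-name flow through `novo` for a hypothetical input "sample.fastq"
# (pf == "sample"; default depth=50, pctid=96, dedup=cdhit):
#
#   sample.fastq
#     -> sample.diginorm.fastq                      (digital normalization)
#     -> sample-K23.histogram                       (23-mer histogram, genome size)
#     -> sample.fiona.fa                            (fiona error correction)
#     -> sample.fiona.fa.P96.cdhit.consensus.fasta  (dedup consensus)
#     -> sample.cov.fasta -> sample.filtered.fasta  (coverage >= depth//5, length >= 50)
#     -> sample.final.fasta                         (sequentially renamed)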