def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically pair
    and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept verbatim.
    #
    # `args` is the list of command-line tokens: [folder, Ecoli.fasta] plus
    # any of the options registered below.
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option(
        "--mapped",
        default=False,
        action="store_true",
        help="Retain contaminated reads instead",
    )
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    # Exactly two positional arguments are required; otherwise show help.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)

    # Default is to keep the reads that do NOT hit the contaminant reference.
    tag = "--unmapped" if not opts.mapped else "--mapped"

    # Everything after the read files is the same for every library, so it is
    # assembled once up front.
    trailing_opts = [tag, "--cutoff={0}".format(opts.cutoff), "--null"]
    if opts.mateorientation:
        trailing_opts.append(
            "--mateorientation={0}".format(opts.mateorientation)
        )

    # iter_project yields (read files, prefix) pairs; the prefix is unused here.
    for read_files, _prefix in iter_project(folder):
        align([ecoli] + read_files + trailing_opts)
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically pair
    and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so its
    # wording is kept verbatim.
    #
    # `args` is the list of command-line tokens: [folder, Ecoli.fasta] plus
    # any of the options registered below.
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option(
        "--mapped",
        default=False,
        action="store_true",
        help="Retain contaminated reads instead [default: %default]",
    )
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    # Exactly two positional arguments are required; otherwise show help.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)

    # Keep the unmapped (clean) reads unless --mapped was requested.
    if opts.mapped:
        tag = "--mapped"
    else:
        tag = "--unmapped"

    # Options that follow the read files are identical for every library.
    shared_opts = [tag, "--cutoff={0}".format(opts.cutoff), "--null"]
    if opts.mateorientation:
        shared_opts.append("--mateorientation={0}".format(opts.mateorientation))

    # The meaning of the second iter_project argument is not visible from
    # this function; presumably it is the expected number of files per group.
    for read_files, _prefix in iter_project(folder, 2):
        # align returns a (samfile, logfile) pair; neither is used further.
        _samfile, _logfile = align([ecoli] + read_files + shared_opts)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above is used as the OptionParser usage text.
    #
    # For every read group found in `folder`, a sample of reads is aligned to
    # the reference inside a scratch directory, the median insert size is read
    # from the alignment stats, and a "<original file>\t<suggested link name>"
    # line is recorded. All lines are finally written to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Pick the aligner front-end and the matching pair-statistics routine.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for reads, prefix in iter_project(folder, 2):
        samplefq = op.join(work, prefix + ".first.fastq")
        # Presumably extracts the first `opts.firstN` reads into samplefq.
        first([str(opts.firstN)] + reads + ["-o", samplefq])

        # Alignment runs inside the scratch directory. Always return to the
        # starting directory, even when the aligner or the stats step fails,
        # so a failure does not leave the process stranded in `work`.
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"

        # Round the median up to two significant digits, e.g. 3456 -> 3500.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))

        for i, xp in enumerate(reads):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(messages), tee=True)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # NOTE: the docstring above is used as the OptionParser usage text.
    #
    # For every read pair found in `folder`, a sample of each mate file is
    # aligned to the reference inside a scratch directory, the median insert
    # size is read from the alignment stats, and a
    # "<original file>\t<suggested link name>" line is recorded. All lines
    # are finally written to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for reads, prefix in iter_project(folder):
        # Sample each of the two mate files separately so pairing is kept.
        samplefq = []
        for i in range(2):
            samplefq.append(
                op.join(work, prefix + "_{0}.first.fastq".format(i + 1))
            )
            # Presumably extracts the first `opts.firstN` reads of this mate.
            first([str(opts.firstN)] + [reads[i]] + ["-o", samplefq[i]])

        # Alignment runs inside the scratch directory. Always return to the
        # starting directory, even when the aligner or the stats step fails,
        # so a failure does not leave the process stranded in `work`.
        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"

        # Round the median up to two significant digits, e.g. 3456 -> 3500.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))

        for i, xp in enumerate(reads):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(messages), tee=True)
def contamination(args):
    """
    %prog contamination Ecoli.fasta genome.fasta read.fastq

    Check read contamination on a folder of paired reads. Use bowtie2 to compare
    the reads against:
    1. Ecoli.fasta - this will tell us the lower bound of contamination
    2. genome.fasta - this will tell us the upper bound of contamination
    """
    # NOTE: the docstring above is used as the OptionParser usage text.
    #
    # Output: a one-line, tab-separated report `<read.fastq>.Ecoli` holding
    # (fastq, lower bound, midpoint, upper bound), plus a summary on stderr.
    from jcvi.apps.bowtie import BowtieLogFile, align

    p = OptionParser(contamination.__doc__)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ecoli, genome, fq = args
    firstN_opt = "--firstN={0}".format(opts.firstN)

    # Rate against E. coli gives the lower bound of the contamination.
    samfile, logfile = align([ecoli, fq, firstN_opt])
    lowerbound = BowtieLogFile(logfile).rate

    # Whatever does not align to the genome gives the upper bound.
    samfile, logfile = align([genome, fq, firstN_opt])
    upperbound = 100 - BowtieLogFile(logfile).rate

    # Divide by a float so the midpoint is not floored on Python 2 should the
    # rates be integers.
    midpoint = (lowerbound + upperbound) / 2.0

    lowerbound = "{0:.1f}".format(lowerbound)
    upperbound = "{0:.1f}".format(upperbound)
    midpoint = "{0:.1f}".format(midpoint)

    # Explicit .write() calls replace the Python 2-only `print >> fh`
    # statements and behave the same on Python 2 and 3; the context manager
    # closes the report even if the write fails.
    clogfile = fq + ".Ecoli"
    with open(clogfile, "w") as fw:
        fw.write("\t".join((fq, lowerbound, midpoint, upperbound)) + "\n")

    sys.stderr.write(
        "{0}: Ecoli contamination rate {1}-{2}\n".format(
            fq, lowerbound, upperbound
        )
    )
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above is used as the OptionParser usage text.
    #
    # Returns the path of the annotated FASTA file that was written.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template plus one read length of flank on each side.
    expected_size = size + 2 * rl
    # Reads needed for the requested depth, rounded up to a multiple of 1000.
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align(
        [bes, reads, "--reorder", "--mapped", "--firstN={0}".format(opts.firstN)]
    )
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {query: b for query, b in Blast(blastfile).iter_best_hit()}

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        # Leading index makes the records sort in the order of `bes`.
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]

    # Open the output only once the records are ready: a failure above no
    # longer leaves a truncated file behind, and the handle is closed even
    # if the write itself raises.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug(
        "Annotated seqs (n={0}) written to `{1}`.".format(
            len(recs), annotatedfasta
        )
    )

    return annotatedfasta