def convert(args):
    """
    %prog convert in.fastq out.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding
    """
    p = OptionParser(convert.__doc__)
    p.set_phred()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infastq, outfastq = args
    # Input offset: user-specified via --phred, otherwise guessed from the file.
    offset = opts.phred or str(guessoffset([infastq]))
    if offset == "64":
        src_enc, dst_enc = "illumina", "sanger"
    else:
        src_enc, dst_enc = "sanger", "illumina"

    # EMBOSS seqret performs the actual quality re-encoding.
    if infastq.endswith(".gz"):
        # Decompress on the fly and feed seqret through stdin.
        cmd = "zcat {0} | ".format(infastq)
        cmd += "seqret fastq-{0}::stdin fastq-{1}::stdout".format(src_enc, dst_enc)
    else:
        cmd = "seqret fastq-{0}::{1} fastq-{2}::stdout".format(
            src_enc, infastq, dst_enc)

    sh(cmd, outfile=outfastq)
    return outfastq
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # FIX: was `sys.exit(p.print_help())`, which exits with status 0
        # (print_help() returns None). Negate to get a nonzero status on
        # usage error, consistent with the other commands in this module.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    # A long insert without --outtie is suspicious for mate-pair libraries.
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # size == 0 means an unmated (fragment) library.
    mated = size != 0
    # Library name is derived from the first file's basename; strip the
    # Illumina "_1_sequence" read-pair suffix.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Quality offset: user-specified via --phred, else guessed from the reads.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(tophat.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.add_option("--intron", default=15000, type="int",
                 help="Max intron size [default: %default]")
    p.add_option("--dist", default=-50, type="int",
                 help="Mate inner distance [default: %default]")
    p.add_option("--stdev", default=50, type="int",
                 help="Mate standard deviation [default: %default]")
    p.set_phred()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)
    # FIX: the loop variable was named `p`, shadowing (and clobbering) the
    # OptionParser instance `p` above — renamed to `reads` to avoid the trap.
    for reads, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        cmd = "tophat -p {0}".format(opts.cpus)
        if opts.gtf:
            cmd += " -G {0}".format(opts.gtf)
        cmd += " -o {0}".format(outdir)

        if num == 1:  # Single-end
            a, = reads
        else:  # Paired-end
            a, b = reads
            cmd += " --max-intron-length {0}".format(opts.intron)
            cmd += " --mate-inner-dist {0}".format(opts.dist)
            cmd += " --mate-std-dev {0}".format(opts.stdev)

        # Quality offset: user-specified via --phred, else guessed from the
        # first read file of this sample.
        phred = opts.phred or str(guessoffset([a]))
        if phred == "64":
            cmd += " --phred64-quals"
        cmd += " {0} {1}".format(reference, " ".join(reads))

        sh(cmd)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # FIX: was `sys.exit(p.print_help())`, which exits with status 0
        # (print_help() returns None). Negate to get a nonzero status on
        # usage error, consistent with the other commands in this module.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    # A long insert without --outtie is suspicious for mate-pair libraries.
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # size == 0 means an unmated (fragment) library.
    mated = size != 0
    # Library name is derived from the first file's basename; strip the
    # Illumina "_1_sequence" read-pair suffix.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Quality offset: user-specified via --phred, else guessed from the reads.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def convert(args):
    """
    %prog convert in.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding. Output gzipped file
    if input is also gzipped.
    """
    p = OptionParser(convert.__doc__)
    p.set_phred()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    infastq, = args
    # Input offset (user-specified or guessed) and the opposite, target offset.
    phred = opts.phred or str(guessoffset([infastq]))
    ophred = {"64": "33", "33": "64"}[phred]

    # Derive the output name: insert ".q<target-offset>" before the extension,
    # preserving a trailing ".gz" if the input was gzipped.
    gz = infastq.endswith(".gz")
    if gz:
        base = infastq.rsplit(".", 1)[0]
    else:
        base = infastq
    pf, sf = base.rsplit(".", 1)
    outfastq = "{0}.q{1}.{2}".format(pf, ophred, sf)
    if gz:
        outfastq += ".gz"

    if phred == "64":
        src_enc, dst_enc = "illumina", "sanger"
    else:
        src_enc, dst_enc = "sanger", "illumina"

    # EMBOSS seqret performs the actual quality re-encoding.
    if infastq.endswith(".gz"):
        # Decompress on the fly and feed seqret through stdin.
        cmd = "zcat {0} | ".format(infastq)
        cmd += "seqret fastq-{0}::stdin fastq-{1}::stdout".format(src_enc, dst_enc)
    else:
        cmd = "seqret fastq-{0}::{1} fastq-{2}::stdout".format(
            src_enc, infastq, dst_enc)

    sh(cmd, outfile=outfastq)
    return outfastq
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    p = OptionParser(gatk.__doc__)
    p.add_option("--indelrealign", default=False, action="store_true",
                 help="Perform indel realignment")
    p.set_home("gatk")
    p.set_home("picard")
    p.set_phred()
    p.set_cpus(cpus=24)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, ref = args
    # Prefix for all derived file names (drops the ".bam" extension).
    pf = bamfile.rsplit(".", 1)[0]
    # Emits the pipeline as makefile rules (mm.add(deps, target, cmds))
    # rather than executing commands directly.
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar".format(opts.gatk_home)
    tk += " -R {0}".format(ref)

    # Step 0 - build reference: sequence dictionary (.dict) and faidx index,
    # both required by GATK.
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    cmd1 = picard + " CreateSequenceDictionary"
    cmd1 += " R={0} O={1}".format(ref, dictfile)
    cmd2 = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (cmd1, cmd2))

    # Step 1 - sort bam by coordinate and index it.
    sortedbamfile = pf + ".sorted.bam"
    cmd = picard + " SortSam"
    cmd += " INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile)
    cmd += " SORT_ORDER=coordinate CREATE_INDEX=true"
    mm.add(bamfile, sortedbamfile, cmd)

    # Step 2 - mark duplicates (PCR/optical) so they are ignored downstream.
    dedupbamfile = pf + ".dedup.bam"
    cmd = picard + " MarkDuplicates"
    cmd += " INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile)
    cmd += " METRICS_FILE=dedup.log CREATE_INDEX=true"
    mm.add(sortedbamfile, dedupbamfile, cmd)

    if opts.indelrealign:
        # Step 3 - create indel realignment targets.
        intervals = pf + ".intervals"
        cmd = tk + " -T RealignerTargetCreator"
        cmd += " -I {0} -o {1}".format(dedupbamfile, intervals)
        mm.add(dedupbamfile, intervals, cmd)

        # Step 4 - indel realignment around the targets from step 3.
        realignedbamfile = pf + ".realigned.bam"
        cmd = tk + " -T IndelRealigner"
        cmd += " -targetIntervals {0}".format(intervals)
        cmd += " -I {0} -o {1}".format(dedupbamfile, realignedbamfile)
        mm.add((dictfile, intervals), realignedbamfile, cmd)
    else:
        # Skip realignment; the deduped bam feeds straight into SNP calling.
        realignedbamfile = dedupbamfile

    # Step 5 - SNP calling with HaplotypeCaller.
    vcf = pf + ".vcf"
    cmd = tk + " -T HaplotypeCaller"
    cmd += " -I {0}".format(realignedbamfile)
    cmd += " --genotyping_mode DISCOVERY"
    cmd += " -stand_emit_conf 10 -stand_call_conf 30"
    cmd += " -nct {0}".format(opts.cpus)
    cmd += " -o {0}".format(vcf)
    if opts.phred == "64":
        # Recode legacy Illumina (offset-64) qualities on the fly.
        cmd += " --fix_misencoded_quality_scores"
    mm.add(realignedbamfile, vcf, cmd)

    # Step 6 - SNP filtering: site-level LOWQUAL filter plus per-genotype
    # HOMOVAR/HET annotations.
    filtered_vcf = pf + ".filtered.vcf"
    cmd = tk + " -T VariantFiltration"
    cmd += " -V {0}".format(vcf)
    cmd += ' --filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"'
    cmd += ' --filterName "LOWQUAL"'
    cmd += ' --genotypeFilterExpression "isHomVar == 1"'
    cmd += ' --genotypeFilterName "HOMOVAR"'
    cmd += ' --genotypeFilterExpression "isHet == 1"'
    cmd += ' --genotypeFilterName "HET"'
    cmd += " -o {0}".format(filtered_vcf)
    mm.add(vcf, filtered_vcf, cmd)

    # Write the accumulated rules out as a makefile.
    mm.write()
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic jar file [default: %default]")
    p.set_phred()
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    # One file = single-end mode, two files = paired-end mode.
    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Download and unpack Trimmomatic if the jar is not already present.
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Materialize the bundled adapter sequences for ILLUMINACLIP.
    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Quality offset: user-specified via --phred, else guessed from the reads.
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    # Output suffixes; gzipped by default unless --nogz.
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    # prefix: basename without ".gz" and without the final extension;
    # dirname: directory part with a trailing "/" (or empty for cwd).
    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    get_dirname = lambda x: "{0}/".format(op.dirname(x)) if op.dirname(x) else ''
    if len(args) == 1:
        # Single-end mode: input -> one trimmed output.
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        dirname = get_dirname(fastqfile)
        frags1 = dirname + prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        # Paired-end mode: two inputs -> paired outputs plus orphan (frags)
        # outputs for reads whose mate was dropped.
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        dirname1 = get_dirname(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        dirname2 = get_dirname(fastqfile2)
        pairs1 = dirname1 + prefix1 + pairs
        pairs2 = dirname2 + prefix2 + pairs
        frags1 = dirname1 + prefix1 + frags
        frags2 = dirname2 + prefix2 + frags
        if opts.nofrags:
            # Discard orphan reads entirely.
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    # Trimming steps: adapter clipping, then (unless --adapteronly)
    # end-trimming and sliding-window quality trimming, then length filter.
    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    # Normalize output qualities to sanger (offset 33) when input was not.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)