def DepthOfCoverage():
    """Assemble and run a GATK DepthOfCoverage command line.

    Options (looked up after ordering "DepthOfCoverage", "gatk_default"):

    INPUT_LIST  file passed to -I. Default: bamfiles.list
    OUTPUT      output file name base, passed to -o. Default: depthofcoverage
    bam         glob of bam files; when it matches anything, the absolute
                paths of the matches are written to bamfiles.list
    REF         value passed to -R
    TARGET      value passed to -L
    opts        command line options to pass, appended last
    javamem     placed between "java -jar" and the jar path
    gatk_home   directory holding GenomeAnalysisTK.jar

    The command is only handed to run_cmd if the input list file exists.
    """
    options.order("DepthOfCoverage", "gatk_default")
    output = options.get("OUTPUT", "depthofcoverage")
    matches = glob.glob(options.get("bam", ""))
    if matches:
        # One absolute path per line, in glob order.
        with open("bamfiles.list", "w") as listing:
            listing.writelines(os.path.abspath(name) + "\n" for name in matches)
    input_list = options.get("INPUT_LIST", "bamfiles.list")
    javamem = options.get("javamem")
    opts = options.get("opts", "")
    gatk_home = options.get("gatk_home")
    jar = path(gatk_home) / "GenomeAnalysisTK.jar"
    words = [
        "java -jar", javamem, jar,
        "-T", "DepthOfCoverage",
        "-I", str(input_list),
        "-o", str(output),
        "-R", options.get("REF"),
        "-L", options.get("TARGET"),
        str(opts),
    ]
    cl = [" ".join(words)]
    if not os.path.exists(input_list):
        return
    run_cmd(cl, None, None, options.run, "Running DepthOfCoverage")
def pbzip2(options, info):
    """Run pbzip2 on files found under a base directory.

    Options (looked up after ``options.order('pbzip2')``):

    basedir     directory to search. Default: os.path.curdir
    glob        file glob handed to ``basedir.walkfiles`` and to pbzip2.
                Default: None, in which case nothing is done
    opts        command line options to pass to pbzip2. Default: -v
    decompress  add ``-d`` to the options. Default: False

    ``info`` is accepted but not used by this function.
    """
    options.order('pbzip2')
    pattern = options.get('glob', None)
    if pattern is None:
        # No glob means nothing to search for and nothing to put on the
        # command line, so there is no work to do.
        return
    opts = options.get('opts', "-v")
    if options.get('decompress', False):
        # Added as its own flag so multi-word opts (e.g. "-v -p4") stay valid.
        opts = opts + " -d"
    # Wrap in path() so a plain-string basedir from the config also works.
    basedir = path(options.get('basedir', os.path.curdir))
    cl = [" ".join(['pbzip2', opts, pattern])]
    # An iterator is always truthy, so fetch the first match explicitly to
    # learn whether anything matched at all.
    first = next(iter(basedir.walkfiles(pattern)), None)
    if first is not None:
        run_cmd(cl, first, None, options.run, "Running pbzip2")
def GATK():
    """Assemble and run a generic GATK command line.

    Options (looked up after ordering "GATK", "gatk_default"):

    INPUT      input file, emitted as INPUT=<value>. Default: ""
    OUTPUT     output file name, emitted as OUTPUT=<value>. Default: ""
    program    GATK program to run (-T option). Default: ""
    opts       command line options to pass, appended last. Default: ""
    javamem    placed between "java -jar" and the jar path
    gatk_home  location of gatk (directory holding GenomeAnalysisTK.jar)

    Nothing is run when INPUT is empty.
    """
    options.order("GATK", "gatk_default")
    infile = options.get("INPUT", "")
    outfile = options.get("OUTPUT", "")
    program = options.get("program", "")
    javamem = options.get("javamem")
    opts = options.get("opts", "")
    gatk_home = options.get("gatk_home")
    jar = path(gatk_home) / "GenomeAnalysisTK.jar"
    words = [
        "java -jar", javamem, jar,
        "-T", program,
        "INPUT=" + str(infile),
        "OUTPUT=" + str(outfile),
        str(opts),
    ]
    cl = [" ".join(words)]
    if not infile:
        return
    run_cmd(cl, None, None, options.run, "Running gatk program %s" % program)
def annovar_convert_to_annovar():
    """Run annovar's convert2annovar.pl on the INPUT file.

    Options:

    INPUT         file to convert; nothing is done when it is not set
    FORMAT        value for --format. Default: vcf4
    OUTPUT        file the shell pipeline is redirected to.
                  Default: INPUT + ".avinput"
    opts          extra options placed after the script name. Default: ""
    annovar_home  directory holding convert2annovar.pl

    The converter output is piped through sed to strip leading blanks and
    tabs from each line before it is written to OUTPUT.
    """
    options.order("annovar_convert_to_annovar")
    query = options.get("INPUT", None)
    fformat = options.get("FORMAT", "vcf4")
    opts = options.get("opts", "")
    if query is None:
        return
    outfile = options.get("OUTPUT", query + ".avinput")
    script = path(options.get("annovar_home")) / "convert2annovar.pl"
    pipeline = "--format %s %s | sed -e \"s/^[ \t]*//g\" > %s" % (fformat, query, outfile)
    cl = [" ".join([script, opts, pipeline])]
    run_cmd(cl, query, None, options.get("run"), msg="Running convert2annovar.pl")
def annovar_summarize_annovar():
    """Run annovar's summarize_annovar.pl on the INPUT file.

    Options:

    INPUT         query file; nothing is done when it is not set
    DB            database directory. Default: annovar_home / "humandb"
    OUTPUT        value for --outfile. Default: INPUT + ".summarize_annovar"
    opts          extra options placed after the script name. Default: ""
    annovar_home  directory holding summarize_annovar.pl
    """
    options.order("annovar_summarize_annovar")
    query = options.get("INPUT", None)
    database = options.get("DB", path(options.get("annovar_home")) / "humandb")
    opts = options.get("opts", "")
    if query is None:
        return
    outfile = options.get("OUTPUT", query + ".summarize_annovar")
    script = path(options.get("annovar_home")) / "summarize_annovar.pl"
    words = [script, opts, "--outfile %s" % (outfile), query, database]
    cl = [" ".join(words)]
    run_cmd(cl, query, None, options.get("run"), msg="Running summarize_annovar.pl")
def annovar_annotate_variation():
    """Run annovar's annotate_variation.pl on the INPUT file.

    Options:

    INPUT         query file; nothing is done when it is not set
    DB            database directory. Default: annovar_home / "humandb"
    OUTPUT        value for --outfile. Default: INPUT + "annotate_variation"
    opts          extra options placed after the script name. Default: ""
    annovar_home  directory holding annotate_variation.pl
    """
    options.order("annovar_annotate_variation")
    query = options.get("INPUT", None)
    database = options.get("DB", path(options.get("annovar_home")) / "humandb")
    # Default to "" (as the other annovar wrappers do): a None here cannot be
    # joined into the command line and raised TypeError when opts was unset.
    opts = options.get("opts", "")
    if query is None:
        return
    # NOTE(review): the default has no "." separator, unlike the sibling
    # annovar tasks (".avinput", ".summarize_annovar") - confirm intent.
    outfile = options.get("OUTPUT", query + "annotate_variation")
    script = path(options.get("annovar_home")) / "annotate_variation.pl"
    cl = [" ".join([script, opts, "--outfile %s" % (outfile), query, database])]
    run_cmd(cl, query, None, options.get("run"), msg="Running annovar annotate variation")
def crisp_to_vcf():
    """Convert crisp output to vcf with crisp_to_vcf.py.

    Options:

    INPUT        crisp file to convert; nothing is done when it is not set
    crispr_home  directory holding crisp_to_vcf.py
    run          passed through to run_cmd

    The script's standard output is redirected to INPUT + ".vcf".
    """
    options.order("crisp_to_vcf")
    infile = options.get("INPUT", None)
    if infile is None:
        return
    outfile = infile + ".vcf"
    # Join the directory and script name first. "%" and "/" share precedence,
    # so formatting the directory into a plain string before applying "/"
    # ends up dividing two strings, which raises TypeError.
    script = path(options.get("crispr_home")) / "crisp_to_vcf.py"
    cl = [" ".join([script, infile, ">", outfile])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running crisp_to_vcf")
def sam2bam():
    """Run samtools view on a sam file, redirecting the output to a bam name.

    Options:

    INPUT   sam file; falls back to the "prefix" option. Nothing is done
            when neither is set
    opts    options for samtools view.
            Default: options.samtools_default.sam2bam["opts"]

    The output name is INPUT with ".sam" replaced by ".bam".
    """
    options.order("sam2bam")
    default = options.samtools_default
    samfile = options.get("INPUT", options.get("prefix", None))
    if samfile is None:
        return
    opts = options.get("opts", default.sam2bam.get("opts"))
    bamfile = samfile.replace(".sam", ".bam")
    command = " ".join(["samtools view", opts, samfile, ">", bamfile])
    cl = [command]
    run_cmd(cl, samfile, bamfile, options.run, "Running %s" % cl)
def multiBamCov():
    """multiBamCov wrapper.

    Options (looked up after ordering "multiBamCov", "bedtools_default"):

    bams           glob of bam files; also passed (made absolute relative
                   to workdir) as the -bams argument
    bed            file passed as the -bed argument
    outfile        passed to run_cmd as the output file.
                   Default: first matched bam without extension
                   + "-multiBamCov.txt"
    workdir        directory the bams glob is joined onto. Default: ./
    opts           extra options, appended last. Default: ""
    bedtools_home  directory holding the multiBamCov executable

    Nothing is run when bams or bed is unset, or when the glob matches no
    file.
    """
    options.order("multiBamCov", "bedtools_default")
    bams = options.get("bams", None)
    bed = options.get("bed", None)
    # Validate before globbing: glob.glob(None) raises.
    if bams is None or bed is None:
        return
    bamfiles = glob.glob(bams)
    # glob returns an empty list (never None) when nothing matches.
    if not bamfiles:
        return
    outfile = options.get("outfile", os.path.splitext(bamfiles[0])[0] + "-multiBamCov.txt")
    opts = options.get("opts", "")
    tool = path(options.get("bedtools_home")) / "multiBamCov"
    bams_arg = os.path.abspath(os.path.join(options.get("workdir", "./"), bams))
    cl = [" ".join([tool, "-bams", bams_arg, "-bed", bed, opts])]
    # NOTE(review): outfile is handed to run_cmd, but the command line has
    # no "> outfile" redirect like the samtools wrappers - confirm intent.
    run_cmd(cl, bamfiles[0], outfile, options.get("run"), None)
def samsort():
    """Run samtools sort.

    Options:

    INPUT   file to sort; falls back to the "prefix" option. Nothing is
            done when neither is set
    opts    options for samtools sort.
            Default: options.samtools_default.samsort["opts"]

    The output prefix given to samtools is INPUT without its extension plus
    "-sort". No output file is passed to run_cmd.
    """
    options.order("samsort")
    default = options.samtools_default
    infile = options.get("INPUT", options.get("prefix", None))
    if infile is None:
        return
    # Only the stem is needed; the former unused "outfile" local (built with
    # str.replace on the extension) has been dropped.
    prefix = os.path.splitext(infile)[0]
    opts = options.get("opts", default.samsort.get("opts"))
    cl = [" ".join(["samtools sort", opts, infile, prefix + "-sort"])]
    run_cmd(cl, infile, None, options.run, "Running %s" % cl)
def bam2bam():
    """Run samtools view on a bam file, redirecting to a derived file name.

    Options:

    INPUT    bam file; falls back to the "prefix" option. Nothing is done
             when neither is set
    opts     options for samtools view.
             Default: options.samtools_default.bam2bam["opts"]
    ext_out  text that replaces ".bam" in INPUT to form the output name.
             Default: test
    """
    options.order("bam2bam")
    default = options.samtools_default
    bamfile = options.get("INPUT", options.get("prefix", None))
    if bamfile is None:
        return
    opts = options.get("opts", default.bam2bam.get("opts"))
    replacement = options.get("ext_out", "test")
    target = bamfile.replace(".bam", replacement)
    command = " ".join(["samtools view", opts, bamfile, ">", target])
    cl = [command]
    run_cmd(cl, bamfile, target, options.run, "Running %s" % cl)
def MergeBamAlignment():
    """Run picard MergeBamAlignment.

    Options (looked up after ordering "MergeBamAlignment", "picard_default"):

    UNMAPPED_BAM        unmapped bam file. Default: options.prefix
    OUTPUT              output file.
                        Default: get_prefix(UNMAPPED_BAM)[0] + ".bam"
    REFERENCE_SEQUENCE  reference. Default: entry [2] of
                        options.index_loc[options.aligner][options.ref]
    opts                extra options placed before the generated ones
    javamem             placed between "java -jar" and the jar path
    picard_home         directory holding MergeBamAlignment.jar

    A message is written to stderr and nothing is run when no unmapped bam
    is available.
    """
    options.order("MergeBamAlignment", "picard_default")
    unmapped_bam = options.get("UNMAPPED_BAM", options.prefix)
    # Check first: the OUTPUT default below is derived from unmapped_bam, so
    # testing for None only afterwards came too late to be useful.
    if unmapped_bam is None:
        sys.stderr.write("required argument unmapped_bam missing\n")
        return
    output = options.get("OUTPUT", get_prefix(unmapped_bam)[0] + ".bam")
    ref = options.get("REFERENCE_SEQUENCE", options.index_loc.get(options.aligner).get(options.ref)[2])
    opts = options.get("opts", "")
    opts += " OUTPUT=%s UNMAPPED_BAM=%s REFERENCE_SEQUENCE=%s" % (output, unmapped_bam, ref)
    jar = path(options.get("picard_home")) / "MergeBamAlignment.jar"
    cl = [" ".join(["java -jar", options.get("javamem"), jar, opts])]
    run_cmd(cl, unmapped_bam, output, options.get("run"), msg="Running MergeBamAlignment")
def automated_initial_analysis():
    """Run automated_initial_analysis.py.

    Options:

    INPUT        flowcell to run the pipeline on; nothing is done when unset
    workdir      last argument of the command. Default: ""
    yaml_config  first argument. Default: post_process.yaml
    run_info     third argument. Default: run_info.yaml

    run_cmd is always called with its run argument set to True.
    """
    options.order("automated_initial_analysis")
    flowcell = options.get("INPUT", None)
    work_dir = options.get("workdir", "")
    if flowcell is None:
        return
    yaml_config = options.get("yaml_config", "post_process.yaml")
    run_info = options.get("run_info", "run_info.yaml")
    words = ["automated_initial_analysis.py", yaml_config, flowcell, run_info, work_dir]
    cl = [" ".join(words)]
    run_cmd(cl, flowcell, None, True, "running automated_initial_analysis")
def BuildBamIndex():
    """Build a bam index with picard BuildBamIndex.

    Options (looked up after ordering "BuildBamIndex", "picard_default"):

    INPUT                  bam file to index (made absolute)
    OUTPUT                 index file (made absolute). Default: INPUT with
                           a trailing ".bam" replaced by ".bai"
    VALIDATION_STRINGENCY  Default: SILENT
    opts                   extra options placed before the generated ones
    javamem                placed between "java -jar" and the jar path
    picard_home            directory holding BuildBamIndex.jar

    A message is written to stderr and nothing is run when INPUT is unset.
    """
    options.order("BuildBamIndex", "picard_default")
    infile = options.get("INPUT", None)
    # Validate before os.path.abspath, which cannot handle None.
    if infile is None:
        sys.stderr.write("required argument missing\n")
        return
    infile = os.path.abspath(infile)
    # Remove the ".bam" suffix only. str.rstrip(".bam") strips any trailing
    # run of the characters ".", "b", "a", "m", which eats into names such
    # as "samba.bam".
    suffix = ".bam"
    stem = infile[:-len(suffix)] if infile.endswith(suffix) else infile
    outfile = os.path.abspath(options.get("OUTPUT", stem + ".bai"))
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    opts = options.get("opts", "")
    opts += " INPUT=%s OUTPUT=%s VALIDATION_STRINGENCY=%s" % (infile, outfile, validation_stringency)
    jar = path(options.get("picard_home")) / "BuildBamIndex.jar"
    cl = [" ".join(["java -jar", options.get("javamem"), jar, opts])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running BuildBamIndex")
def align():
    """Run bwa aln: <program> aln <opts> <index> infile > outfile.

    Options (looked up after ordering "align", "bwa_default" with
    add_rest=True):

    prefix   taken from options.align, falling back to the "prefix" option;
             nothing is done when neither is set
    ext_in   (in options.align) input extension used when prefix has none.
             Default: options.ext_fq
    ext_out  output extension. Default: options.get("aln")["ext_out"]
    program  executable name
    aln      mapping supplying "opts" (and the default "ext_out")

    The index is entry [2] of options.index_loc["bwa"][options.ref].
    """
    options.order("align", "bwa_default", add_rest=True)
    msg = "Running paver.ngs.tools.bwa.align"
    given = options.align.get("prefix", options.get("prefix"))
    if given is None:
        return
    stem, ext = os.path.splitext(given)
    if not ext:
        ext = options.align.get("ext_in", options.ext_fq)
    infile = stem + ext
    outfile = stem + options.get("ext_out", options.get("aln")["ext_out"])
    words = [
        options.get("program"),
        "aln",
        options.get("aln")["opts"],
        options.index_loc["bwa"][options.ref][2],
        infile,
        ">",
        outfile,
    ]
    cl = [" ".join(words)]
    run_cmd(cl, infile, outfile, options.run, msg)
def FastqToSam():
    """Write an unaligned bam file with picard FastqToSam.

    Options (looked up after ordering "FastqToSam", "picard_default"):

    FASTQ           first fastq file. Default:
                    current_prefix(options.read_suffix + options.ext_fq)
    FASTQ2          second fastq file. Default: read2()
    QUALITY_FORMAT  Default: Standard
    OUTPUT          output file (required)
    SAMPLE_NAME     Default: get_prefix(FASTQ)[0]
    opts            extra options placed before the generated ones
    javamem         placed between "java -jar" and the jar path
    picard_home     directory holding FastqToSam.jar

    A message is printed to stderr and nothing is run when OUTPUT or
    SAMPLE_NAME is None.
    """
    options.order("FastqToSam", "picard_default")
    f1 = options.get("FASTQ", current_prefix(options.read_suffix + options.ext_fq))
    f2 = options.get("FASTQ2", read2())
    qv = options.get("QUALITY_FORMAT", "Standard")
    output = options.get("OUTPUT", None)
    sample_name = options.get("SAMPLE_NAME", get_prefix(f1)[0])
    opts = options.get("opts", "")
    if output is None or sample_name is None:
        print >> sys.stderr, "required argument missing"
        return
    generated = " FASTQ=%s FASTQ2=%s QUALITY_FORMAT=%s SAMPLE_NAME=%s OUTPUT=%s" % (f1, f2, qv, sample_name, output)
    opts = opts + generated
    javamem = options.get("javamem")
    jar = path(options.get("picard_home")) / "FastqToSam.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    run_cmd(cl, f1, output, options.get("run"), msg="Running FastqToSam")
def CalculateHsMetrics():
    """Calculate hybrid selection metrics with picard CalculateHsMetrics.

    Options:

    INPUT                  input file (required)
    TI                     target intervals (required)
    BI                     bait intervals. Default: value of TI
    OUTPUT                 Default: INPUT without extension + ".hs_metrics"
    VALIDATION_STRINGENCY  Default: SILENT
    opts                   extra options placed before the generated ones
    javamem, picard_home   fall back to options.picard_default

    A message is written to stderr and nothing is run when INPUT or TI is
    unset.
    """
    options.order("CalculateHsMetrics")
    default = options.picard_default
    infile = options.get("INPUT", None)
    targets = options.get("TI", None)
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    if infile is None or targets is None:
        # Either one can be the missing argument, so name both.
        sys.stderr.write("required argument INPUT or TI missing\n")
        return
    prefix = os.path.splitext(infile)[0]
    outfile = options.get("OUTPUT", prefix + ".hs_metrics")
    baits = options.get("BI", targets)
    opts = options.get("opts", "")
    opts += " INPUT=%s OUTPUT=%s BI=%s TI=%s VALIDATION_STRINGENCY=%s" % (infile, outfile, baits, targets, validation_stringency)
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("picard_home", default.get("picard_home"))) / "CalculateHsMetrics.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    # The progress message names this tool; it used to say
    # CollectAlignmentSummaryMetrics.
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running CalculateHsMetrics")
def sampe(options):
    """Run bwa sampe. Takes as input a fastq file or a prefix.

    Options (looked up after ordering "sampe", "bwa_default"):

    prefix        taken from options.sampe, falling back to the "prefix"
                  option; nothing is done when neither is set
    ext_in        (in options.sampe) fastq extension used when prefix has
                  none. Default: options.ext_fq
    read1_suffix  suffix appended to the prefix for read 1 files
    read2_suffix  suffix appended to the prefix for read 2 files
    aln           mapping supplying "ext_out" for the two sai file names
    sampe         mapping supplying "opts" and the output "ext_out"
    program       executable name

    The index is entry [2] of options.index_loc["bwa"][options.ref].
    """
    msg = "Running paver.ngs.tools.bwa.sampe"
    options.order("sampe", "bwa_default")
    prefix = options.sampe.get("prefix", options.get("prefix"))
    if prefix is None:
        return
    prefix, ext = os.path.splitext(prefix)
    if not ext:
        ext = options.sampe.get("ext_in", options.ext_fq)
    read1 = prefix + options.get("read1_suffix")
    read2 = prefix + options.get("read2_suffix")
    sai_ext = options.get("aln")["ext_out"]
    fastq1, fastq2 = read1 + ext, read2 + ext
    saifile1, saifile2 = read1 + sai_ext, read2 + sai_ext
    out = prefix + options.get("sampe")["ext_out"]
    # Same calling convention as the align task: a list of command strings
    # plus the run flag and the progress message (msg was computed but
    # never passed before).
    cl = [" ".join([options.get("program"), "sampe", options.get("sampe")["opts"],
                    options.index_loc["bwa"][options.ref][2],
                    saifile1, saifile2, fastq1, fastq2, ">", out])]
    run_cmd(cl, saifile1, out, options.run, msg)
def SortSam():
    """Sort a sam/bam file with picard SortSam.

    Options:

    INPUT                  taken from options.SortSam, falling back to the
                           "current_file" option (required)
    OUTPUT                 Default: INPUT with "-sort" inserted before the
                           extension
    SORT_ORDER             Default: coordinate
    VALIDATION_STRINGENCY  Default: SILENT
    opts                   extra options placed before the generated ones
    javamem, picard_home   fall back to options.picard_default

    options.current_file is set to the output file name. A message is
    printed to stderr and nothing is run when no input is available.
    """
    options.order("SortSam")
    default = options.picard_default
    infile = options.SortSam.get("INPUT", options.get("current_file", None))
    if infile is None:
        print >> sys.stderr, "required argument missing"
        return
    stem, ext = os.path.splitext(infile)
    outfile = options.get("OUTPUT", stem + "-sort" + ext)
    # Record the result for whatever reads current_file afterwards.
    options.current_file = outfile
    sort_order = options.get("SORT_ORDER", "coordinate")
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    generated = " INPUT=%s OUTPUT=%s SORT_ORDER=%s VALIDATION_STRINGENCY=%s" % (infile, outfile, sort_order, validation_stringency)
    opts = options.get("opts", "") + generated
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("picard_home", default.get("picard_home"))) / "SortSam.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running SortSam")
def ngsrich_evaluate():
    """Run NGSrich evaluate on a read file.

    Options:

    INPUT        read file (-r); nothing is done when it is not set
    REFGENEFILE  passed as -u. Default: refGene.txt
    TMPDIR       passed as -T. Default: /scratch/tmp/<basename of INPUT
                 without extension>

    The target (-t) is options.teqc.target and the output directory (-o) is
    "enrichment/<basename>" next to INPUT. Both the temporary and the
    output directory are created when missing; the temporary directory is
    removed with "rm -rf" after run_cmd returns.
    """
    options.order("ngsrich_evaluate")
    readfile = options.get("INPUT", None)
    refgenefile = options.get("REFGENEFILE", "refGene.txt")
    if readfile is None:
        return
    stem = os.path.splitext(readfile)[0]
    base = os.path.basename(stem)
    tmpdir = options.get("TMPDIR", "/scratch/tmp/%s" % base)
    enrichdir = os.path.join(os.path.dirname(stem), "enrichment/" + base)
    for directory in (tmpdir, enrichdir):
        if not os.path.exists(directory):
            os.makedirs(directory)
    words = [
        "java NGSrich",
        "evaluate -r %s" % readfile,
        "-u %s -t %s -o %s" % (refgenefile, options.teqc.target, enrichdir),
        "-T %s" % tmpdir,
    ]
    cl = [" ".join(words)]
    run_cmd(cl, readfile, None, options.run, "Running NGSrich on %s" % readfile)
    sh("rm -rf %s" % tmpdir)
def coverageBed():
    """coverageBed wrapper.

    Options (looked up after ordering "coverageBed", "bedtools_default"):

    a       file passed with -a
    abam    file passed with -abam; takes precedence over a when set
    b       file passed with -b (required)
    d       add the -d flag. Default: False
    output  file the output is redirected to. Default: /dev/stdout
    opts    extra options placed after the program name. Default: ""

    Nothing is run when neither a nor abam is set. When b is missing a
    message is written to stderr and nothing is run.
    """
    options.order("coverageBed", "bedtools_default")
    a = options.get("a", None)
    b = options.get("b", None)
    abam = options.get("abam", None)
    d = options.get("d", False)
    output = options.get("output", "/dev/stdout")
    aopt = "-a"
    if abam is not None:
        aopt, a = "-abam", abam
    opts = options.get("opts", "")
    if d:
        # Separate with a space so "-d" does not fuse with the last
        # user-supplied option (e.g. "-s" + "-d" -> "-s-d").
        opts = (opts + " -d").strip()
    if a is None:
        return
    if b is None:
        # A None here cannot be joined into the command line.
        sys.stderr.write("coverageBed: required option b missing\n")
        return
    cl = [" ".join(["coverageBed", opts, aopt, a, "-b", b, ">", output])]
    run_cmd(cl, a, output, options.run, None)
def map_reads():
    """Collects bwa functions for mapping. Runs aligner and samse/sampe

    options.run is set to False before the bwa tasks are called and back to
    True before the final run_cmd call on the module-level bwa["cl"] list.
    Presumably the tasks only collect their command lines while run is
    False - TODO confirm against run_cmd.
    """
    options.run = False
    # Fresh, empty per-call settings for the align task.
    options.align = Bunch()
    # NOTE(review): only the paired-end case is handled here; the three
    # call_task lines are taken to form that branch - confirm against the
    # original layout.
    if options.paired_end:
        # align is called twice, presumably once per read - TODO confirm.
        environment.call_task("ngs.paver.tools.bwa.align")
        environment.call_task("ngs.paver.tools.bwa.align")
        environment.call_task("ngs.paver.tools.bwa.sampe")
    options.run = True
    # run_cmd's return value replaces the stored command list.
    bwa["cl"] = run_cmd(bwa["cl"])
def teqc():
    """Run the teqc_enrichment.py script.

    Options (looked up after ordering "teqc" with add_rest=True):

    abam        bam file, first argument of the script (required)
    target      target file, second argument of the script (required)
    ucscgenome  when set, passed as --build=<value>
    flank       when set, passed as --flank=<value>

    A message is written to stderr and nothing is run when abam or target
    is unset.
    """
    options.order("teqc", add_rest=True)
    bamfile = options.get("abam", None)
    targetfile = options.get("target", None)
    ucscgenome = options.get("ucscgenome", None)
    flank = options.get("flank", None)
    if bamfile is None or targetfile is None:
        sys.stderr.write("missing arguments\n")
        # Stop here: a None cannot be joined into the command line.
        return
    # Collect the optional flags as separate words so that --build and
    # --flank do not run together when both are given.
    extra = []
    if ucscgenome is not None:
        extra.append("--build=%s" % ucscgenome)
    if flank is not None:
        extra.append("--flank=%s" % flank)
    cl = [" ".join(["teqc_enrichment.py", bamfile, targetfile] + extra)]
    run_cmd(cl, bamfile, None, options.run, None)
def CollectAlignmentSummaryMetrics():
    """Collect alignment summary metrics with picard.

    Options:

    INPUT                  input file (required)
    OUTPUT                 Default: INPUT without extension
                           + ".align_metrics"
    REFERENCE_SEQUENCE     Default: entry [2] of
                           options.index_loc["sam_fa"][options.ref];
                           the literal "null" is used when options.ref
                           is false
    VALIDATION_STRINGENCY  Default: SILENT
    opts                   extra options placed before the generated ones
    javamem, picard_home   fall back to options.picard_default

    A message is printed to stderr and nothing is run when INPUT is unset.
    """
    options.order("CollectAlignmentSummaryMetrics")
    default = options.picard_default
    infile = options.get("INPUT", None)
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    if infile is None:
        print >> sys.stderr, "required argument infile missing"
        return
    stem = os.path.splitext(infile)[0]
    outfile = options.get("OUTPUT", stem + ".align_metrics")
    if options.ref:
        ref = options.get("REFERENCE_SEQUENCE", options.index_loc["sam_fa"][options.ref][2])
    else:
        ref = "null"
    generated = " INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s VALIDATION_STRINGENCY=%s" % (infile, outfile, ref, validation_stringency)
    opts = options.get("opts", "") + generated
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("picard_home", default.get("picard_home"))) / "CollectAlignmentSummaryMetrics.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running CollectAlignmentSummaryMetrics")
def samse():
    """Run bwa samse.

    Reads options.prefix, options.read_suffix, options.ext_fq and
    options.ref, plus the module-level ``bwa`` mapping ("program", "aln",
    "samse" and the "cl" command list).

    The new command is appended to bwa["cl"], and bwa["cl"] is then
    replaced by the return value of run_cmd.
    """
    prefix, ext = os.path.splitext(options.prefix)
    if not ext:
        ext = options.ext_fq
    read_stem = prefix + options.read_suffix
    saifile = read_stem + bwa["aln"]["ext_out"]
    # Use the extension worked out above, as the sampe task does; it was
    # computed but then ignored in favour of options.ext_fq.
    fastq = read_stem + ext
    out = options.prefix + bwa["samse"]["ext_out"]
    command = " ".join([bwa["program"], "samse", bwa["samse"]["opts"],
                        options.index_loc["bwa"][options.ref][2],
                        saifile, fastq, ">", out])
    bwa["cl"].append(command)
    bwa["cl"] = run_cmd(bwa["cl"], saifile, out)
def AddOrReplaceReadGroups():
    """Add or replace read groups.

    NOTE(review): despite the task name, the command assembled here runs
    MarkDuplicates.jar with MarkDuplicates arguments and reports itself as
    "Running MarkDuplicates" - confirm whether this is a placeholder.

    Options:

    INPUT                  taken from options.AddOrReplaceReadGroups,
                           falling back to the "current_file" option
                           (required)
    OUTPUT                 Default: INPUT with "-dup" inserted before the
                           extension
    METRICS                Default: INPUT without extension
                           + "-dup.metrics"
    VALIDATION_STRINGENCY  Default: SILENT
    REMOVE_DUPLICATES      Default: True
    opts                   extra options placed before the generated ones
    javamem, picard_home   fall back to options.picard_default

    options.current_file is set to the output file name. A message is
    printed to stderr and nothing is run when no input is available.
    """
    options.order("AddOrReplaceReadGroups")
    default = options.picard_default
    infile = options.AddOrReplaceReadGroups.get("INPUT", options.get("current_file", None))
    if infile is None:
        print >> sys.stderr, "required argument missing"
        return
    stem, ext = os.path.splitext(infile)
    outfile = options.get("OUTPUT", stem + "-dup" + ext)
    options.current_file = outfile
    metrics = options.get("METRICS", stem + "-dup.metrics")
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    remove_duplicates = options.get("REMOVE_DUPLICATES", True)
    generated = " INPUT=%s OUTPUT=%s METRICS_FILE=%s VALIDATION_STRINGENCY=%s REMOVE_DUPLICATES=%s" % (infile, outfile, metrics, validation_stringency, remove_duplicates)
    opts = options.get("opts", "") + generated
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("picard_home", default.get("picard_home"))) / "MarkDuplicates.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running MarkDuplicates")
def mpileup():
    """Run samtools mpileup.

    Options:

    INPUT      input file (made absolute); nothing is done when unset
    outfile    output file. Default: absolute INPUT without extension
               + ".mpileup"
    reference  reference sequence (-f). Default: entry [2] of
               options.index_loc["sam_fa"][options.ref]
    opts       extra options placed after "samtools mpileup". Default: ""
    """
    options.order("mpileup")
    infile = options.get("INPUT", None)
    # Validate before os.path.abspath, which cannot handle None.
    if infile is None:
        return
    infile = os.path.abspath(infile)
    ref = options.get("reference", options.index_loc["sam_fa"][options.ref][2])
    opts = options.get("opts", "")
    outfile = options.get("outfile", os.path.abspath(os.path.splitext(infile)[0] + ".mpileup"))
    cl = [" ".join(["samtools mpileup", str(opts), "-f", ref, infile, ">", outfile])]
    run_cmd(cl, infile, outfile, options.run, "running samtools mpileup")
def MergeSamFiles():
    """Merge sam/bam files with picard MergeSamFiles.

    Options:

    INPUT                  sequence of at least two files to merge
                           (required)
    OUTPUT                 Default: first input without extension
                           + "-merge.bam"
    VALIDATION_STRINGENCY  Default: SILENT
    opts                   extra options placed before the generated ones
    javamem, picard_home   fall back to options.picard_default

    A message is written to stderr and nothing is run when INPUT is unset.
    With fewer than two inputs a message is written and the process exits
    with a non-zero status.
    """
    options.order("MergeSamFiles")
    default = options.picard_default
    infile = options.get("INPUT", None)
    validation_stringency = options.get("VALIDATION_STRINGENCY", "SILENT")
    if infile is None:
        sys.stderr.write("required argument infile missing\n")
        return
    # NOTE(review): assumes INPUT is a list/tuple of paths; a single string
    # would be measured and iterated character by character - confirm.
    if len(infile) < 2:
        sys.stderr.write("len(infile) < 2: need at least two files for merging\n")
        # Signal the failure to the caller instead of exiting with status 0.
        sys.exit(1)
    prefix = os.path.splitext(infile[0])[0]
    outfile = options.get("OUTPUT", prefix + "-merge.bam")
    opts = options.get("opts", "")
    opts += " OUTPUT=%s VALIDATION_STRINGENCY=%s" % (outfile, validation_stringency)
    # Picard's argument for each file to merge is INPUT, not INFILE.
    opts += "".join(" INPUT=%s" % f for f in infile)
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("picard_home", default.get("picard_home"))) / "MergeSamFiles.jar"
    cl = [" ".join(["java -jar", javamem, jar, opts])]
    run_cmd(cl, infile, outfile, options.get("run"), msg="Running MergeSamFiles")
def muTect_paired():
    """Run muTect for a paired normal/tumour sample.

    Options:

    INPUT          normal sample (--input_file:normal), required
    INPUT2         tumour sample (--input_file:tumor), required
    OUTPUT         value for --out. Default: mutect_paired.call_stats.txt
    COVERAGE_FILE  value for --coverage_file.
                   Default: mutect_paired.coverage.wig
    reference      value for --reference_sequence. Default: entry [2] of
                   options.index_loc["sam_fa"][options.ref]
    dbsnp          when non-empty, added as -B:dbsnp,VCF <value>
    cosmic         when non-empty, added as -B:cosmic,VCF <value>
    opts           extra options placed before the generated ones
    javamem, mutect_home  fall back to options.mutect_default

    Nothing is run unless both INPUT and INPUT2 are set.
    """
    options.order("muTect_paired")
    default = options.mutect_default
    normal = options.get("INPUT", None)
    tumour = options.get("INPUT2", None)
    dbsnp = options.get("dbsnp", "")
    cosmic = options.get("cosmic", "")
    if normal is None or tumour is None:
        return
    call_stats = options.get("OUTPUT", "mutect_paired.call_stats.txt")
    coverage = options.get("COVERAGE_FILE", "mutect_paired.coverage.wig")
    # Looked up as before, but not placed on the command line.
    VALIDATION_STRICTNESS = options.get("VALIDATION_STRICTNESS", default.get("VALIDATION_STRICTNESS"))
    ref = options.get("reference", options.index_loc["sam_fa"][options.ref][2])
    opts = options.get("opts", "")
    opts = opts + " --reference_sequence %s --input_file:normal %s --input_file:tumor %s" % (ref, normal, tumour)
    if dbsnp:
        opts = opts + " -B:dbsnp,VCF %s" % dbsnp
    if cosmic:
        opts = opts + " -B:cosmic,VCF %s" % cosmic
    javamem = options.get("javamem", default.get("javamem"))
    jar = path(options.get("mutect_home", default.get("mutect_home"))) / "muTect.jar"
    words = [
        "java -jar", javamem, jar,
        "--analysis_type MuTect",
        "--out %s" % call_stats,
        "--coverage_file %s" % coverage,
        "--log_to_file muTect_paired.log",
        opts,
    ]
    cl = [" ".join(words)]
    run_cmd(cl, normal, call_stats, options.get("run"), msg="Running muTect_paired")
def pigz(options):
    """Run pigz on a bunch of files.

    Options (looked up after ``options.order('pigz')``):

    pattern     file glob to look for. Default: "" (matches nothing)
    opts        command line options to pass to pigz. Default: -v
    decompress  add ``-d`` to the options. Default: False
    recursive   match file names under the current working directory
                recursively instead of globbing. Default: False

    Symbolic links are skipped. One pigz command is built per remaining
    file; when no file remains a message is written to stderr and nothing
    is run.
    """
    options.order('pigz')
    glob_str = options.get("pattern", "")
    opts = options.get('opts', "-v")
    decompress = options.get('decompress', False)
    recursive = options.get('recursive', False)
    if decompress:
        # Added as its own flag so multi-word opts (e.g. "-v -p 4") stay valid.
        opts = opts + " -d"
    if recursive:
        files = [os.path.join(root, filename)
                 for root, _dirnames, filenames in os.walk(os.getcwd())
                 for filename in fnmatch.filter(filenames, glob_str)]
    else:
        files = glob.glob(glob_str)
    # Drop symlinks up front so the reported files and the file handed to
    # run_cmd are exactly the ones that get a command.
    targets = [f for f in files if not os.path.islink(f)]
    if not targets:
        sys.stderr.write("No files to process\n")
        return
    cl = [" ".join(['pigz', opts, f]) for f in targets]
    run_cmd(cl, targets[0], None, options.run, "Running pigz on %s" % " ".join(targets))