import os
import subprocess

# Helpers referenced below (file_transaction, curdir_tmpdir, utils, broad,
# safe_makedir, bed_to_interval) come from the surrounding pipeline codebase;
# these functions are collected from several modules.

def _unified_genotyper(picard, align_bam, ref_file, dbsnp=None):
    """Perform SNP genotyping on the given alignment file.
    """
    out_file = "%s-snp.vcf" % os.path.splitext(align_bam)[0]
    params = ["-T", "UnifiedGenotyper",
              "-I", align_bam,
              "-R", ref_file,
              "-o", out_file,
              "-A", "DepthOfCoverage",
              "-A", "AlleleBalance",
              "-A", "HomopolymerRun",
              "-A", "QualByDepth",
              "--genotype_likelihoods_model", "SNP",
              "-baq", "CALCULATE_AS_NECESSARY",
              "--standard_min_confidence_threshold_for_calling", "10.0",
              "--standard_min_confidence_threshold_for_emitting", "10.0",
              #"--trigger_min_confidence_threshold_for_calling", "10.0",
              #"--trigger_min_confidence_threshold_for_emitting", "10.0",
              "--downsample_to_coverage", "10000",
              "--min_base_quality_score", "20",
              "-l", "INFO",
              ]
    if dbsnp:
        params += ["-B:dbsnp,VCF", dbsnp]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            picard.run_gatk(params)
    return out_file

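# ``file_transaction`` and ``curdir_tmpdir`` are imported helpers, not shown
# here. A minimal sketch of the contract they appear to provide -- assuming
# file_transaction cleans up partial outputs when the block fails and
# curdir_tmpdir yields a scratch directory; the real implementations may
# differ (the real file_transaction also appears to accept lists of files):
import contextlib
import shutil
import tempfile

@contextlib.contextmanager
def _file_transaction_sketch(*out_files):
    """Hypothetical stand-in: remove partial output files if the block raises."""
    try:
        yield
    except:
        for fname in out_files:
            if os.path.exists(fname):
                os.remove(fname)
        raise

@contextlib.contextmanager
def _curdir_tmpdir_sketch():
    """Hypothetical stand-in: temporary directory under the working directory."""
    tmp_dir = tempfile.mkdtemp(dir=os.getcwd())
    try:
        yield tmp_dir
    finally:
        shutil.rmtree(tmp_dir)
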
def picard_fastq_to_bam(picard, fastq_one, fastq_two, out_dir,
                        platform, sample_name="", rg_name="", pu_name=""):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    qual_formats = {"illumina": "Illumina"}
    try:
        qual_format = qual_formats[platform.lower()]
    except KeyError:
        raise ValueError("Need to specify quality format for %s" % platform)
    out_bam = os.path.join(out_dir, "%s.bam" %
                           os.path.splitext(os.path.basename(fastq_one))[0])
    if not (os.path.exists(out_bam) and os.path.getsize(out_bam) > 0):
        with curdir_tmpdir() as tmp_dir:
            opts = [("FASTQ", fastq_one),
                    ("QUALITY_FORMAT", qual_format),
                    ("READ_GROUP_NAME", rg_name),
                    ("SAMPLE_NAME", sample_name),
                    ("PLATFORM_UNIT", pu_name),
                    ("PLATFORM", platform),
                    ("TMP_DIR", tmp_dir),
                    ("OUTPUT", out_bam)]
            if fastq_two:
                opts.append(("FASTQ2", fastq_two))
            with file_transaction(out_bam):
                picard.run("FastqToSam", opts)
    return out_bam

def variant_eval(vcf_in, ref_file, dbsnp, target_intervals, picard):
    """Evaluate variants in comparison with dbSNP reference.
    """
    out_file = "%s.eval" % os.path.splitext(vcf_in)[0]
    params = ["-T", "VariantEval",
              "-R", ref_file,
              "--eval", vcf_in,
              "--dbsnp", dbsnp,
              "-ST", "Filter",
              "-o", out_file,
              "-l", "INFO",
              ]
    if target_intervals:
        params.extend(["-L", target_intervals])
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            picard.run_gatk(params)
    return out_file

def _variant_filtration(picard, snp_file, ref_file):
    """Filter out problematic SNP calls.

    Recommended Broad hard filtering for deep coverage exomes:
    QUAL < 30.0 || AB > 0.75 && DP > 40 || QD < 5.0 || HRun > 5 || SB > -0.10
    """
    out_file = "%s-filter%s" % os.path.splitext(snp_file)
    params = ["-T", "VariantFiltration",
              "-R", ref_file,
              "-o", out_file,
              "-B:variant,VCF", snp_file,
              "--filterName", "QUALFilter",
              "--filterExpression", "QUAL <= 50.0",
              "--filterName", "QDFilter",
              "--filterExpression", "QD < 5.0",
              "--filterName", "ABFilter",
              "--filterExpression", "AB > 0.75 && DP > 40",
              "--filterName", "HRunFilter",
              "--filterExpression", "HRun > 3.0",
              "--filterName", "SBFilter",
              "--filterExpression", "SB > -0.10",
              "-l", "INFO",
              ]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            picard.run_gatk(params)
    return out_file

def _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                         tranch_file, filter_type):
    """Apply recalibration details, returning filtered VCF file.
    """
    base, ext = os.path.splitext(snp_file)
    out_file = "{base}-{filter}filter{ext}".format(base=base, ext=ext,
                                                   filter=filter_type)
    params = ["-T", "ApplyRecalibration",
              "-R", ref_file,
              "--input", snp_file,
              "--out", out_file,
              "--tranches_file", tranch_file,
              "--recal_file", recal_file,
              "--mode", filter_type,
              ]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            broad_runner.run_gatk(params)
    return out_file

def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.
    """
    if not multiplex:
        return [("", "", fastq1, fastq2)]
    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    nomatch_file = "%s_unmatched_1_fastq.txt" % base_name
    metrics_file = "%s_bc.metrics" % base_name
    out_files = []
    for info in multiplex:
        fq_fname = lambda x: os.path.join(bc_dir, "%s_%s_%s_fastq.txt" %
                                          (base_name, info["barcode_id"], x))
        bc_file1 = fq_fname("1")
        bc_file2 = fq_fname("2") if fastq2 else None
        out_files.append((info["barcode_id"], info["name"], bc_file1, bc_file2))
    with utils.chdir(bc_dir):
        if not os.path.exists(nomatch_file) and not os.path.exists(metrics_file):
            tag_file = _make_tag_file(multiplex)
            cl = [config["program"]["barcode"], tag_file,
                  "%s_--b--_--r--_fastq.txt" % base_name,
                  fastq1]
            if fastq2:
                cl.append(fastq2)
            cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
            cl.append("--metrics=%s" % metrics_file)
            if int(config["algorithm"]["bc_read"]) == 2:
                cl.append("--second")
            if int(config["algorithm"]["bc_position"]) == 5:
                cl.append("--five")
            if config["algorithm"].get("bc_allow_indels", True) is False:
                cl.append("--noindel")
            with utils.file_transaction(out_files + [nomatch_file, metrics_file]):
                subprocess.check_call(cl)
    out_files = [(b, n, f1, f2) for (b, n, f1, f2) in out_files
                 if os.path.exists(f1)]
    return out_files

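# A hedged usage sketch for ``split_by_barcode``. The dictionary keys mirror
# what the function reads; the barcode program name, sequences and values
# below are hypothetical examples, not settings from the original pipeline:
def _example_demultiplex():
    multiplex = [{"barcode_id": "1", "name": "sample1", "sequence": "ATCACG"},
                 {"barcode_id": "2", "name": "sample2", "sequence": "CGATGT"}]
    config = {"program": {"barcode": "barcode_sort_trim.py"},
              "algorithm": {"bc_mismatch": 1, "bc_read": 1, "bc_position": 3}}
    return split_by_barcode("lane1_1_fastq.txt", "lane1_2_fastq.txt",
                            multiplex, "lane1", {"work": "."}, config)
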
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None,
                           region=None, out_file=None, deep_coverage=False):
    """Generate a list of interval regions for realignment around indels.
    """
    if out_file:
        out_file = "%s.intervals" % os.path.splitext(out_file)[0]
    else:
        out_file = "%s-realign.intervals" % os.path.splitext(align_bam)[0]
    params = ["-T", "RealignerTargetCreator",
              "-I", align_bam,
              "-R", ref_file,
              "-o", out_file,
              "-l", "INFO",
              ]
    if region:
        params += ["-L", region]
    if dbsnp:
        params += ["--known", dbsnp]
    if deep_coverage:
        params += ["--mismatchFraction", "0.30",
                   "--maxIntervalSize", "650"]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            runner.run_gatk(params)
    return out_file

def _gatk_count_covariates(picard, dup_align_bam, ref_file, platform,
                           snp_file):
    """Step 1 of GATK recalibration process -- counting covariates.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    params = ["-T", "CountCovariates",
              "-cov", "ReadGroupCovariate",
              "-cov", "QualityScoreCovariate",
              "-cov", "CycleCovariate",
              "-cov", "DinucCovariate",
              "-cov", "TileCovariate",
              "-recalFile", out_file,
              "-I", dup_align_bam,
              "-R", ref_file,
              "-l", "INFO",
              "-U",
              "-OQ",
              "--default_platform", platform,
              ]
    if snp_file:
        params += ["-B:dbsnp,VCF", snp_file]
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                picard.run_gatk(params, tmp_dir)
    return out_file

def align(fastq_file, pair_file, ref_file, out_base, align_dir, config):
    """Align reads with TopHat, generating a spliced SAM alignment.
    """
    qual_format = config["algorithm"].get("quality_format", None)
    if qual_format is None or qual_format.lower() == "illumina":
        qual_flags = ["--solexa1.3-quals"]
    else:
        qual_flags = []
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, _out_fnames[0])
    files = [ref_file, fastq_file]
    if not os.path.exists(out_file):
        cl = [config["program"].get("tophat", "tophat")]
        cl += qual_flags
        cl += ["-m", str(config["algorithm"].get("max_errors", 0)),
               "--output-dir", out_dir,
               "--no-convert-bam"]
        if pair_file:
            d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                    ref_file, out_base,
                                                    out_dir, config)
            cl += ["--mate-inner-dist", str(d),
                   "--mate-std-dev", str(d_stdev)]
            files.append(pair_file)
        cl += files
        with file_transaction([os.path.join(out_dir, f) for f in _out_fnames]):
            subprocess.check_call(cl)
    out_file_final = os.path.join(out_dir, "%s.sam" % out_base)
    # Guard the symlink so reruns do not fail on an existing link.
    if not os.path.exists(out_file_final):
        os.symlink(out_file, out_file_final)
    return out_file_final

def _variant_filtration_indel(broad_runner, snp_file, ref_file, vrn_files,
                              config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    filter_type = "INDEL"
    cov_interval = config["algorithm"].get("coverage_interval", "exome").lower()
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file)
    if cov_interval in ["exome", "regional"]:
        return _variant_filtration_no_recal(
            broad_runner, snp_file, ref_file, filter_type,
            ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        assert vrn_files.train_indels, "Need indel training file specified"
        params.extend(
            ["-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0",
             vrn_files.train_indels,
             "-an", "QD",
             "-an", "FS",
             "-an", "HaplotypeScore",
             "-an", "ReadPosRankSum"])
        if not (os.path.exists(recal_file) and os.path.getsize(recal_file) > 0):
            with file_transaction(recal_file, tranches_file):
                broad_runner.run_gatk(params)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)

def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           deep_coverage=False):
    """Perform realignment of BAM file in specified regions.
    """
    out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    params = ["-T", "IndelRealigner",
              "-I", align_bam,
              "-R", ref_file,
              "-targetIntervals", intervals,
              "-o", out_file,
              "-l", "INFO",
              ]
    if deep_coverage:
        params += ["--maxReadsInMemory", "300000",
                   "--maxReadsForRealignment", str(int(5e5)),
                   "--maxReadsForConsensuses", "500",
                   "--maxConsensuses", "100"]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                runner.run_gatk(params, tmp_dir)
    return out_file

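# ``gatk_realigner_targets`` and ``gatk_indel_realignment`` are designed to
# chain: target creation feeds the realigner. A minimal sketch of that
# composition (the wrapper name is hypothetical):
def _realign_sample(runner, align_bam, ref_file, dbsnp=None,
                    deep_coverage=False):
    """Create realignment targets, then realign the BAM around them."""
    intervals = gatk_realigner_targets(runner, align_bam, ref_file,
                                       dbsnp=dbsnp,
                                       deep_coverage=deep_coverage)
    return gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                                  deep_coverage=deep_coverage)
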
def picard_index(picard, in_bam):
    """Index a BAM file with Picard's BuildBamIndex."""
    index_file = "%s.bai" % in_bam
    if not os.path.exists(index_file):
        opts = [("INPUT", in_bam), ("OUTPUT", index_file)]
        with file_transaction(index_file):
            picard.run("BuildBamIndex", opts)
    return index_file

def _collect_align_metrics(self, dup_bam, ref_file):
    """Collect alignment summary metrics with Picard."""
    base, ext = os.path.splitext(dup_bam)
    align_metrics = "%s.align_metrics" % base
    if not os.path.exists(align_metrics):
        opts = [("INPUT", dup_bam),
                ("OUTPUT", align_metrics),
                ("R", ref_file)]
        with file_transaction(align_metrics):
            self._picard.run("CollectAlignmentSummaryMetrics", opts)
    return align_metrics

def bam_to_wig(bam_file, config, config_file):
    """Provide a BigWig coverage file of the sorted alignments.
    """
    wig_file = "%s.bigwig" % os.path.splitext(bam_file)[0]
    if not (os.path.exists(wig_file) and os.path.getsize(wig_file) > 0):
        cl = [config["analysis"]["towig_script"], bam_file, config_file]
        with file_transaction(wig_file):
            subprocess.check_call(cl)
    return wig_file

def _insert_sizes(self, dup_bam):
    """Generate insert size metrics and a distribution plot with Picard."""
    base, ext = os.path.splitext(dup_bam)
    insert_metrics = "%s.insert_metrics" % base
    insert_graph = "%s-insert.pdf" % base
    if not os.path.exists(insert_metrics):
        opts = [("INPUT", dup_bam),
                ("OUTPUT", insert_metrics),
                ("H", insert_graph)]
        with file_transaction(insert_graph, insert_metrics):
            self._picard.run("CollectInsertSizeMetrics", opts)
    return insert_graph, insert_metrics

def _gc_bias(self, dup_bam, ref_file):
    """Collect GC bias metrics and the accompanying plot with Picard."""
    base, ext = os.path.splitext(dup_bam)
    gc_metrics = "%s.gc_metrics" % base
    gc_graph = "%s-gc.pdf" % base
    if not os.path.exists(gc_metrics):
        opts = [("INPUT", dup_bam),
                ("OUTPUT", gc_metrics),
                ("CHART", gc_graph),
                ("R", ref_file)]
        with file_transaction(gc_graph, gc_metrics):
            self._picard.run("CollectGcBiasMetrics", opts)
    return gc_graph, gc_metrics

def picard_index_ref(picard, ref_file):
    """Provide a Picard style dict index file for a reference genome.
    """
    dict_file = "%s.dict" % os.path.splitext(ref_file)[0]
    if not os.path.exists(dict_file):
        opts = [("REFERENCE", ref_file), ("OUTPUT", dict_file)]
        with file_transaction(dict_file):
            picard.run("CreateSequenceDictionary", opts)
    return dict_file

def unified_genotyper(align_bam, ref_file, config, dbsnp=None,
                      region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    if coverage_depth in ["low"]:
        confidence = "4.0"
    else:
        confidence = "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    params = ["-T", "UnifiedGenotyper",
              "-I", align_bam,
              "-R", ref_file,
              "-o", out_file,
              "--annotation", "QualByDepth",
              "--annotation", "HaplotypeScore",
              "--annotation", "MappingQualityRankSumTest",
              "--annotation", "ReadPosRankSumTest",
              "--annotation", "FisherStrand",
              "--annotation", "RMSMappingQuality",
              "--annotation", "DepthOfCoverage",
              "--genotype_likelihoods_model", "BOTH",
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--min_mapping_quality_score", "20",
              "-l", "INFO",
              ]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", region]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            broad_runner.run_gatk(params)
    return out_file

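# Hedged usage sketch for ``unified_genotyper``; all paths are placeholders
# and the wrapper name is hypothetical:
def _example_call_variants(config):
    """Show a regional UnifiedGenotyper call.

    config needs an ``algorithm`` section (optional ``coverage_depth``) plus
    whatever broad.runner_from_config requires in the surrounding codebase.
    """
    return unified_genotyper("sample-sort-dup.bam", "GRCh37.fa", config,
                             dbsnp="dbsnp_132.vcf", region="22")
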
def picard_mark_duplicates(picard, align_bam):
    """Mark duplicate reads with Picard, returning the BAM and metrics files."""
    base, ext = os.path.splitext(align_bam)
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not os.path.exists(dup_bam):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam),
                    ("OUTPUT", dup_bam),
                    ("TMP_DIR", tmp_dir),
                    ("METRICS_FILE", dup_metrics)]
            with file_transaction(dup_bam, dup_metrics):
                picard.run("MarkDuplicates", opts)
    return dup_bam, dup_metrics

def split_snps_indels(broad_runner, orig_file, ref_file):
    """Split a variant call file into SNPs and INDELs for processing.
    """
    base, ext = os.path.splitext(orig_file)
    snp_file = "{base}-snp{ext}".format(base=base, ext=ext)
    indel_file = "{base}-indel{ext}".format(base=base, ext=ext)
    params = ["-T", "SelectVariants",
              "-R", ref_file,
              "--variant", orig_file]
    for out_file, select_type in [(snp_file, "SNP"), (indel_file, "INDEL")]:
        if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
            cur_params = params + ["--out", out_file,
                                   "--selectTypeToInclude", select_type]
            with file_transaction(out_file):
                broad_runner.run_gatk(cur_params)
    return snp_file, indel_file

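# ``split_snps_indels`` pairs with the filtration and combine helpers: split
# the raw calls, filter each type, then merge the results. A sketch assuming
# these functions share a module; the SNP hard-filter expressions follow the
# GATK recommendations for SNPs (QD, MQ, FS):
def _example_filter_and_combine(broad_runner, call_file, ref_file,
                                vrn_files, config):
    snp_file, indel_file = split_snps_indels(broad_runner, call_file, ref_file)
    filt_snps = _variant_filtration_no_recal(
        broad_runner, snp_file, ref_file, "SNP",
        ["QD < 2.0", "MQ < 40.0", "FS > 60.0"])
    filt_indels = _variant_filtration_indel(broad_runner, indel_file,
                                            ref_file, vrn_files, config)
    out_file = "%s-filter.vcf" % os.path.splitext(call_file)[0]
    return combine_variant_files([filt_snps, filt_indels], out_file,
                                 ref_file, config)
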
def align(out_dir, ref_index, fastq1, fastq2=None, qual_format=None):
    """Align single or paired end reads with novoalign, writing SAM output."""
    out_file = os.path.join(out_dir, "%s.sam" % _get_base_filename(fastq1))
    if not os.path.exists(out_file):
        cl = ["novoalign", "-o", "SAM", "-r", "None",
              "-d", ref_index, "-f", fastq1]
        if fastq2:
            cl.append(fastq2)
        if qual_format:
            cl += ["-F", qual_format]
        print " ".join(cl)
        with file_transaction(out_file):
            with open(out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file

def _run_snpeff(snp_in, genome, snpeff_jar, se_interval):
    """Run snpEff on a variant file, producing a tab delimited effects file."""
    snpeff_config = "%s.config" % os.path.splitext(snpeff_jar)[0]
    out_file = "%s-effects.tsv" % os.path.splitext(snp_in)[0]
    if not os.path.exists(out_file):
        cl = ["java", "-jar", snpeff_jar, "-1", "-vcf4", "-pass",
              "-c", snpeff_config, genome, snp_in]
        if se_interval:
            cl.extend(["-filterInterval", se_interval])
        print " ".join(cl)
        with file_transaction(out_file):
            with open(out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file

def picard_fixmate(picard, align_bam):
    """Run Picard's FixMateInformation, generating a coordinate sorted output file.
    """
    base, ext = os.path.splitext(align_bam)
    out_file = "%s-sort%s" % (base, ext)
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam),
                    ("OUTPUT", out_file),
                    ("TMP_DIR", tmp_dir),
                    ("SORT_ORDER", "coordinate")]
            with file_transaction(out_file):
                picard.run("FixMateInformation", opts)
    return out_file

def picard_sort(picard, align_bam):
    """Sort a BAM file by coordinates.
    """
    base, ext = os.path.splitext(align_bam)
    out_file = "%s-sort%s" % (base, ext)
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam),
                    ("OUTPUT", out_file),
                    ("TMP_DIR", tmp_dir),
                    ("SORT_ORDER", "coordinate")]
            with file_transaction(out_file):
                picard.run("SortSam", opts)
    return out_file

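# The Picard helpers compose into the usual BAM preparation chain: sort by
# coordinate, mark duplicates, then index. A minimal sketch of that
# composition (the wrapper name is hypothetical):
def _prepare_bam(picard, align_bam):
    sort_bam = picard_sort(picard, align_bam)
    dup_bam, dup_metrics = picard_mark_duplicates(picard, sort_bam)
    picard_index(picard, dup_bam)
    return dup_bam, dup_metrics
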
def annotate_effects(orig_file, snpeff_file, genome_file, config):
    """Annotate predicted variant effects using snpEff.
    """
    broad_runner = broad.runner_from_config(config)
    out_file = "%s-annotated%s" % os.path.splitext(orig_file)
    params = ["-T", "VariantAnnotator",
              "-R", genome_file,
              "-A", "SnpEff",
              "--variant", orig_file,
              "--snpEffFile", snpeff_file,
              "--out", out_file]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            broad_runner.run_gatk(params)
    return out_file

def combine_variant_files(orig_files, out_file, ref_file, config):
    """Combine multiple VCF files into a single output file.
    """
    broad_runner = broad.runner_from_config(config)
    params = ["-T", "CombineVariants",
              "-R", ref_file,
              "--out", out_file]
    priority_order = []
    for orig_file in orig_files:
        name = os.path.splitext(os.path.basename(orig_file))[0]
        params.extend(["--variant:{name}".format(name=name), orig_file])
        priority_order.append(name)
    params.extend(["--rod_priority_list", ",".join(priority_order)])
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            broad_runner.run_gatk(params)
    return out_file

def _variant_filtration_no_recal(broad_runner, snp_file, ref_file,
                                 filter_type, expressions):
    """Perform hard filtering if coverage is in limited regions.

    Variant quality score recalibration will not work on some regions;
    it requires enough positions to train the model.
    """
    base, ext = os.path.splitext(snp_file)
    out_file = "{base}-filter{ftype}{ext}".format(base=base, ext=ext,
                                                  ftype=filter_type)
    params = ["-T", "VariantFiltration",
              "-R", ref_file,
              "--out", out_file,
              "--variant", snp_file]
    for exp in expressions:
        params.extend(["--filterName", "GATKStandard{e}".format(e=exp.split()[0]),
                       "--filterExpression", exp])
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with file_transaction(out_file):
            broad_runner.run_gatk(params)
    return out_file

def picard_merge(picard, in_files, out_file=None):
    """Merge multiple BAM files together with Picard.
    """
    if out_file is None:
        out_file = "%smerge.bam" % os.path.commonprefix(in_files)
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            opts = [("OUTPUT", out_file),
                    ("SORT_ORDER", "coordinate"),
                    ("TMP_DIR", tmp_dir)]
            for in_file in in_files:
                opts.append(("INPUT", in_file))
            with file_transaction(out_file):
                picard.run("MergeSamFiles", opts)
    return out_file

def align(fastq_file, pair_file, ref_file, out_base, align_dir, config):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not os.path.exists(sam_file):
        if not os.path.exists(sai1_file):
            with file_transaction(sai1_file):
                _run_bwa_align(fastq_file, ref_file, sai1_file, config)
        if sai2_file and not os.path.exists(sai2_file):
            with file_transaction(sai2_file):
                _run_bwa_align(pair_file, ref_file, sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file):
            with open(sam_file, "w") as out_handle:
                subprocess.check_call(sam_cl, stdout=out_handle)
    return sam_file

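# Note that this ``align`` shares its name with the TopHat and novoalign
# wrappers above; each comes from a separate aligner module with a similar
# signature. A hedged usage sketch with placeholder paths (config needs
# config["program"]["bwa"] plus whatever _run_bwa_align reads):
def _example_bwa_align(config):
    return align("sample_1.fastq", "sample_2.fastq", "GRCh37.fa",
                 "sample", "align_dir", config)
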
def _hybrid_select_metrics(self, dup_bam, bait_file, target_file):
    """Generate metrics for hybrid selection efficiency.
    """
    base, ext = os.path.splitext(dup_bam)
    metrics = "%s.hs_metrics" % base
    if not os.path.exists(metrics):
        with bed_to_interval(bait_file, dup_bam) as ready_bait:
            with bed_to_interval(target_file, dup_bam) as ready_target:
                opts = [("BAIT_INTERVALS", ready_bait),
                        ("TARGET_INTERVALS", ready_target),
                        ("INPUT", dup_bam),
                        ("OUTPUT", metrics)]
                with file_transaction(metrics):
                    self._picard.run("CalculateHsMetrics", opts)
    return metrics

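# The leading-underscore metrics functions above take ``self`` -- they are
# methods of a metrics-collection class holding a Picard runner. A minimal
# sketch of the assumed host class shape; the real class name and surface
# may differ:
class _MetricsCollectorSketch(object):
    _collect_align_metrics = _collect_align_metrics
    _insert_sizes = _insert_sizes
    _gc_bias = _gc_bias
    _hybrid_select_metrics = _hybrid_select_metrics

    def __init__(self, picard):
        self._picard = picard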