def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.

    Indexes the input BAMs and reference, then assembles the common GATK
    command line: reference inputs, minimum-confidence thresholds,
    annotations, alignment inputs and calling regions.

    Returns a tuple of (broad_runner, params) where params is the argument
    list to pass to the GATK runner.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if num_cores > 1 and gatk_type == "gatk4":
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            # GATK4 renamed arguments to kebab-case; the GATK3-style
            # --interval_set_rule spelling is rejected by GATK4
            params += ["-L", bamprep.region_to_gatk(region),
                       "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, Large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs don't help much beyond a certain point.
    See the 'Downsampling analysis' plots in the GATK documentation:
    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest
    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s-recal.grp" % os.path.splitext(dup_align_bam)[0]
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                if gatk_type == "gatk4":
                    # GATK4 uses kebab-case argument names; the GATK3-style
                    # spellings (--sparkMaster, --knownSites,
                    # --interval_set_rule) are not accepted here
                    params += ["-T", "BaseRecalibratorSpark",
                               "--spark-master", "local[%s]" % dd.get_num_cores(data),
                               "--output", tx_out_file,
                               "--reference", dd.get_ref_twobit(data)]
                    if dbsnp_file:
                        params += ["--known-sites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals,
                                   "--interval-set-rule", "INTERSECTION"]
                else:
                    params += ["-T", "BaseRecalibrator",
                               "-o", tx_out_file, "-R", ref_file]
                    # Downsampling and SOLiD handling only apply to the
                    # GATK3 BaseRecalibrator walker
                    downsample_pct = bam.get_downsample_pct(dup_align_bam,
                                                            target_counts, data)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    if platform.lower() == "solid":
                        params += ["--solid_nocall_strategy", "PURGE_READ",
                                   "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals,
                                   "--interval_set_rule", "INTERSECTION"]
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
        else:
            # No aligned reads: write a stub so downstream checks succeed
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.

    Builds the common argument list: reference inputs, confidence
    thresholds, annotations, alignment inputs and calling regions.
    Returns (broad_runner, params).
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for bam_file in align_bams:
        bam.index(bam_file, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        broad.runner_from_path("picard", config).run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        params += ["--standard_min_confidence_threshold_for_calling", "4.0"]
    for ann in annotation.get_gatk_annotations(config):
        params += ["--annotation", ann]
    for bam_file in align_bams:
        params += ["-I", bam_file]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        # GATK4 switched argument names to kebab-case
        rule_arg = "--interval-set-rule" if gatk_type == "gatk4" else "--interval_set_rule"
        params += ["-L", bamprep.region_to_gatk(region), rule_arg, "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, Large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs don't help much beyond a certain point.
    See the 'Downsampling analysis' plots in the GATK documentation:
    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest
    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.grp" % utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                cores = dd.get_num_cores(data)
                if gatk_type == "gatk4":
                    # GATK4 uses kebab-case argument names; the GATK3-style
                    # spellings (--sparkMaster, --knownSites,
                    # --interval_set_rule) are not accepted here
                    params += ["-T", "BaseRecalibratorSpark",
                               "--spark-master", "local[%s]" % cores,
                               "--output", tx_out_file,
                               "--reference", dd.get_ref_twobit(data),
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
                    if dbsnp_file:
                        params += ["--known-sites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals,
                                   "--interval-set-rule", "INTERSECTION"]
                else:
                    params += ["-T", "BaseRecalibrator",
                               "-o", tx_out_file, "-R", ref_file]
                    # Downsampling and SOLiD handling only apply to the
                    # GATK3 BaseRecalibrator walker
                    downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    if platform.lower() == "solid":
                        params += ["--solid_nocall_strategy", "PURGE_READ",
                                   "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals,
                                   "--interval_set_rule", "INTERSECTION"]
                # Scale GATK memory up when running multicore
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file),
                                      memscale=memscale, parallel_gc=True)
        else:
            # No aligned reads: write a stub so downstream checks succeed
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Build the BQSR covariates table (step 1 of GATK recalibration).

    GATK 4 runs use local multicore spark:
    https://github.com/broadinstitute/gatk/issues/2345
    GATK3 downsamples large whole genome BAMs, since inputs beyond a point do
    not improve recalibration ('Downsampling analysis' plots):
    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest
    spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.grp" % utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if utils.file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        # Nothing to recalibrate; leave a stub so downstream existence checks pass
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        num_cores = dd.get_num_cores(data)
        params = ["-I", dup_align_bam]
        if gatk_type == "gatk4":
            params += ["-T", "BaseRecalibratorSpark",
                       "--spark-master", "local[%s]" % num_cores,
                       "--output", tx_out_file,
                       "--reference", dd.get_ref_twobit(data),
                       "--conf", "spark.driver.host=localhost",
                       "--conf", "spark.network.timeout=800",
                       "--conf", "spark.executor.heartbeatInterval=100",
                       "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            if dbsnp_file:
                params += ["--known-sites", dbsnp_file]
            if intervals:
                params += ["-L", intervals, "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-T", "BaseRecalibrator", "-o", tx_out_file, "-R", ref_file]
            # GATK3 only: downsample oversized inputs and handle SOLiD reads
            downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
            if dbsnp_file:
                params += ["--knownSites", dbsnp_file]
            if intervals:
                params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
        if num_cores > 1:
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.run_gatk(params, os.path.dirname(tx_out_file),
                              memscale=memscale, parallel_gc=True)
    return out_file