Example #1
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        bam.index(x, config)
    if num_cores > 1 and broad_runner.gatk_type() == "gatk4":
        # GATK4 Spark runs use a 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
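
For context, here is a hedged sketch of how a caller might consume the (broad_runner, params) pair returned above. Only _shared_gatk_call_prep and run_gatk appear in the examples on this page; _call_variants_sketch and the HaplotypeCaller arguments are illustrative assumptions, not bcbio's actual calling code.

# Hypothetical usage sketch: prepare shared arguments, then run a GATK3-style
# HaplotypeCaller invocation. All names beyond _shared_gatk_call_prep and
# run_gatk are assumptions.
def _call_variants_sketch(align_bams, items, ref_file, region, out_file):
    broad_runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                                  region, out_file)
    params += ["-T", "HaplotypeCaller", "-o", out_file]
    broad_runner.run_gatk(params, os.path.dirname(out_file))
    return out_file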
Example #2
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s-recal.grp" % os.path.splitext(dup_align_bam)[0]
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                if gatk_type == "gatk4":
                    params += [
                        "-T", "BaseRecalibratorSpark", "--sparkMaster",
                        "local[%s]" % dd.get_num_cores(data), "--output",
                        tx_out_file, "--reference",
                        dd.get_ref_twobit(data)
                    ]
                else:
                    params += [
                        "-T", "BaseRecalibrator", "-o", tx_out_file, "-R",
                        ref_file
                    ]
                    downsample_pct = bam.get_downsample_pct(
                        dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += [
                            "--downsample_to_fraction",
                            str(downsample_pct), "--downsampling_type",
                            "ALL_READS"
                        ]
                    if platform.lower() == "solid":
                        params += [
                            "--solid_nocall_strategy", "PURGE_READ",
                            "--solid_recal_mode", "SET_Q_ZERO_BASE_N"
                        ]
                if dbsnp_file:
                    params += ["--knownSites", dbsnp_file]
                if intervals:
                    params += [
                        "-L", intervals, "--interval_set_rule", "INTERSECTION"
                    ]
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
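
The downsampling fraction above comes from bam.get_downsample_pct, which is not shown on this page. A minimal sketch of the idea, assuming pysam for read counting; the real bcbio helper may differ in detail.

import pysam

def get_downsample_pct_sketch(in_bam, target_counts):
    # Count mapped reads from the BAM index, then return the fraction that
    # would bring the file down to target_counts; None means no downsampling.
    with pysam.AlignmentFile(in_bam, "rb") as bam_handle:
        total = sum(stat.mapped for stat in bam_handle.get_index_statistics())
    if total > target_counts:
        return float(target_counts) / float(total)
    return None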
Example #3
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 Spark runs use a 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
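
This version calls a _use_spark helper that is not shown on this page. A minimal sketch consistent with the inline condition in Example #1 (multiple cores and a GATK4 install):

def _use_spark(num_cores, gatk_type):
    # Spark-based GATK4 tools are only worthwhile with multiple local cores.
    return num_cores > 1 and gatk_type == "gatk4"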
Example #4
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.grp" % utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                cores = dd.get_num_cores(data)
                if gatk_type == "gatk4":
                    params += ["-T", "BaseRecalibratorSpark",
                               "--sparkMaster", "local[%s]" % cores,
                               "--output", tx_out_file, "--reference", dd.get_ref_twobit(data),
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
                else:
                    params += ["-T", "BaseRecalibrator",
                                "-o", tx_out_file, "-R", ref_file]
                    downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    if platform.lower() == "solid":
                        params += ["--solid_nocall_strategy", "PURGE_READ",
                                   "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
                if dbsnp_file:
                    params += ["--knownSites", dbsnp_file]
                if intervals:
                    params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                      parallel_gc=True)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
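
This version builds the output name with utils.splitext_plus rather than os.path.splitext, so compound extensions come off as a unit. A hedged sketch of that behavior; the real bcbio utility may handle more cases.

import os

def splitext_plus_sketch(fname):
    # Like os.path.splitext, but treats compound extensions such as
    # ".fastq.gz" or ".vcf.gz" as a single extension.
    base, ext = os.path.splitext(fname)
    if ext in [".gz", ".bz2", ".zip"]:
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext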
Example #5
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    Spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(
        dd.get_work_dir(data), "align", dd.get_sample_name(data),
        "%s-recal.grp" %
        utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                cores = dd.get_num_cores(data)
                if gatk_type == "gatk4":
                    params += [
                        "-T", "BaseRecalibratorSpark", "--spark-master",
                        "local[%s]" % cores, "--output", tx_out_file,
                        "--reference",
                        dd.get_ref_twobit(data), "--conf",
                        "spark.driver.host=localhost", "--conf",
                        "spark.network.timeout=800", "--conf",
                        "spark.executor.heartbeatInterval=100", "--conf",
                        "spark.local.dir=%s" % os.path.dirname(tx_out_file)
                    ]
                    if dbsnp_file:
                        params += ["--known-sites", dbsnp_file]
                    if intervals:
                        params += [
                            "-L", intervals, "--interval-set-rule",
                            "INTERSECTION"
                        ]
                else:
                    params += [
                        "-T", "BaseRecalibrator", "-o", tx_out_file, "-R",
                        ref_file
                    ]
                    downsample_pct = bam.get_downsample_pct(
                        dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += [
                            "--downsample_to_fraction",
                            str(downsample_pct), "--downsampling_type",
                            "ALL_READS"
                        ]
                    if platform.lower() == "solid":
                        params += [
                            "--solid_nocall_strategy", "PURGE_READ",
                            "--solid_recal_mode", "SET_Q_ZERO_BASE_N"
                        ]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += [
                            "-L", intervals, "--interval_set_rule",
                            "INTERSECTION"
                        ]
                memscale = {
                    "magnitude": 0.9 * cores,
                    "direction": "increase"
                } if cores > 1 else None
                broad_runner.run_gatk(params,
                                      os.path.dirname(tx_out_file),
                                      memscale=memscale,
                                      parallel_gc=True)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
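
The extra --conf pairs in Example #5 are plain Spark properties: pin the driver host for restricted networks, relax timeouts, and keep Spark scratch space next to the output. A hypothetical helper collecting them in one place, using the same values as above:

def _spark_stability_confs(tmp_dir):
    # Spark settings from Example #5, expressed as GATK4 command line pairs.
    confs = [("spark.driver.host", "localhost"),
             ("spark.network.timeout", "800"),
             ("spark.executor.heartbeatInterval", "100"),
             ("spark.local.dir", tmp_dir)]
    out = []
    for key, val in confs:
        out += ["--conf", "%s=%s" % (key, val)]
    return out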