Example #1
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Prepare the runner and common arguments used for GATK variant calling.

    Indexes the reference and each input BAM, then assembles the argument
    list in order: reference, optional downsampling, confidence thresholds
    (only when the configured minimum coverage depth is below 4),
    annotations, input BAMs, dbSNP and calling intervals. Configuration is
    read from the first entry of ``items``.

    Returns a ``(broad_runner, params)`` tuple.
    """
    first_item = items[0]
    config = first_item["config"]
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    args = ["-R", ref_file]
    if dd.is_set_coverage_depth_max(first_item):
        # GATK can only downsample to a minimum of 200
        downsample_depth = max(200, dd.get_coverage_depth_max(first_item))
        args.extend(["--downsample_to_coverage", str(downsample_depth),
                     "--downsampling_type", "BY_SAMPLE"])

    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # The same threshold is applied to both calling and emitting
        args.extend(["--standard_min_confidence_threshold_for_calling", "4.0",
                     "--standard_min_confidence_threshold_for_emitting", "4.0"])

    for annot_name in annotation.get_gatk_annotations(config):
        args.extend(["--annotation", annot_name])
    for bam_file in align_bams:
        args.extend(["-I", bam_file])
    if dbsnp:
        args.extend(["--dbsnp", dbsnp])

    configured_regions = tz.get_in(["algorithm", "variant_regions"], config)
    call_region = subset_variant_regions(configured_regions, region, out_file, items)
    if call_region:
        args.extend(["-L", bamprep.region_to_gatk(call_region),
                     "--interval_set_rule", "INTERSECTION"])
    return runner, args
Example #2
0
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for an input BAM in the given region.

    Args:
        data: Sample dictionary; ``data["work_bam"]`` and ``data["config"]``
            are read directly, the reference and depth settings via helpers.
        region: Optional region to restrict the calculation to.
        out_file: Output BED path. Defaults to the ``work_bam`` path with its
            extension replaced by ``-callable.bed``.

    Returns:
        A single-element list holding a dictionary with the ``callable_bed``
        output path plus the input ``config`` and ``work_bam``.
    """
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    max_depth = dd.get_coverage_depth_max(data)
    # sys.maxint was removed in Python 3; fall back to sys.maxsize there while
    # leaving the value used under Python 2 unchanged.
    unlimited_depth = getattr(sys, "maxint", sys.maxsize) - 1
    # A missing (None) or non-positive maximum means no upper limit. The
    # explicit truthiness check avoids comparing None with an int on Python 3.
    depth = {"max": max_depth * 7 if max_depth and max_depth > 0 else unlimited_depth,
             "min": dd.get_coverage_depth_min(data)}
    # Skip all work when the output already exists.
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_file = tz.get_in(["reference", "fasta", "base"], data)
            region_file, calc_callable = _regions_for_coverage(data, region, ref_file, tx_out_file)
            if calc_callable:
                _group_by_ctype(_get_coverage_file(data["work_bam"], ref_file, region, region_file, depth,
                                                   tx_out_file, data),
                                depth, region_file, tx_out_file, data)
            # special case, do not calculate if we are in a chromosome not covered by BED file
            else:
                # The region file itself becomes the transactional output.
                os.rename(region_file, tx_out_file)
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
Example #3
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Set up the GATK runner and the command line arguments common to calling.

    The reference and all input BAMs are indexed first. The returned argument
    list then carries, in order: the reference, downsampling options when a
    maximum coverage depth is set, confidence thresholds when the configured
    minimum coverage depth is below 4, annotations, input BAMs, dbSNP and the
    intervals to call in. Settings come from the first element of ``items``.

    Returns a ``(broad_runner, params)`` tuple.
    """
    sample = items[0]
    config = sample["config"]
    gatk_runner = broad.runner_from_config(config)
    gatk_runner.run_fn("picard_index_ref", ref_file)
    for in_bam in align_bams:
        bam.index(in_bam, config)

    cmd_args = ["-R", ref_file]

    if dd.is_set_coverage_depth_max(sample):
        # GATK can only downsample to a minimum of 200
        capped_depth = max(200, dd.get_coverage_depth_max(sample))
        cmd_args += ["--downsample_to_coverage", str(capped_depth)]
        cmd_args += ["--downsampling_type", "BY_SAMPLE"]

    depth_floor = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if depth_floor and depth_floor < 4:
        # Calling and emitting share one confidence value
        for threshold_flag in (
                "--standard_min_confidence_threshold_for_calling",
                "--standard_min_confidence_threshold_for_emitting"):
            cmd_args += [threshold_flag, "4.0"]

    for annot in annotation.get_gatk_annotations(config):
        cmd_args.append("--annotation")
        cmd_args.append(annot)

    for in_bam in align_bams:
        cmd_args.append("-I")
        cmd_args.append(in_bam)

    if dbsnp:
        cmd_args += ["--dbsnp", dbsnp]

    target_regions = tz.get_in(["algorithm", "variant_regions"], config)
    interval = subset_variant_regions(target_regions, region, out_file, items)
    if interval:
        gatk_interval = bamprep.region_to_gatk(interval)
        cmd_args += ["-L", gatk_interval]
        cmd_args += ["--interval_set_rule", "INTERSECTION"]

    return gatk_runner, cmd_args