Example 1
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
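The docstring above notes that an empty post-filter target set means the FreeBayes run should be skipped. Below is a minimal, hypothetical caller sketch showing how the returned (opts, no_target_regions) pair might be consumed; _freebayes_call_or_skip and the plain command assembly are illustrative assumptions, while vcfutils.write_empty_vcf and do.run are used as in the other snippets on this page:

# Hypothetical caller sketch, not bcbio's actual FreeBayes runner.
def _freebayes_call_or_skip(items, config, ref_file, align_bams, out_file, region=None):
    opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
    if no_target_regions:
        # Nothing left to call after high-depth filtering: write an empty VCF instead.
        vcfutils.write_empty_vcf(out_file, config,
                                 samples=[dd.get_sample_name(d) for d in items])
    else:
        # Illustrative command assembly; the real pipeline adds piping and post-processing.
        cmd = ["freebayes", "-f", ref_file] + opts + align_bams
        do.run(cmd, "FreeBayes variant calling")
    return out_file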
Example 2
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.population_variant_regions(items, merged=True)
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = ("gunzip -c {tx_orig_file} | "
                   "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                   "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                   "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                   "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                   "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                   "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
Example 3
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    params += standard_cl_params(items)
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Example 4
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example 5
def run_tnscope(align_bams,
                items,
                ref_file,
                assoc_files,
                region=None,
                out_file=None):
    """Call variants with Sentieon's TNscope somatic caller.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(
            bedutils.population_variant_regions(items), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
            dbsnp = "--dbsnp %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            cmd = (
                "{license}sentieon driver -t {cores} -r {ref_file} "
                "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                "--algo TNscope "
                "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                "{dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNscope")
    return out_file
Example 6
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
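The snippet above only prepares the runner and parameter list. Below is a hedged caller sketch, assuming broad_runner exposes a run_gatk(params) entry point (as bcbio's GATK wrappers do) and using GATK3-style -T/-o flags; the tool name and output flag here are illustrative, not taken from the code above:

# Hypothetical consumer of (broad_runner, params); GATK4 would name the tool
# and its output flag differently.
def _call_with_gatk3(align_bams, items, ref_file, region, out_file):
    broad_runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                                  region, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            full_params = ["-T", "HaplotypeCaller"] + params + ["-o", tx_out_file]
            broad_runner.run_gatk(full_params)
    return out_file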
Example 7
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Example 8
def run_haplotyper(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(
            bedutils.population_variant_regions(items), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            out_mode = "--emit_mode gvcf" if joint.want_gvcf(items) else ""
            cmd = (
                "{license}sentieon driver -t {cores} -r {ref_file} "
                "{bams} {interval} --algo Haplotyper {out_mode} {dbsnp} {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file
Example 9
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only smCounter2 calling.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, ("smCounter2 supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region,
                                            out_file, items=items, do_merge=True)
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["smCounter2", "--runPath", os.path.dirname(tx_out_file),
                   "--outPrefix", out_prefix,
                   "--bedTarget", target, "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam, "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(paired.tumor_data)]
            do.run(cmd, "smcounter2 variant calling")
            for fname in glob.glob(os.path.join(os.path.dirname(tx_out_file), "*.smCounter*")):
                shutil.move(fname, os.path.join(os.path.dirname(out_file), os.path.basename(fname)))
            utils.symlink_plus(os.path.join(os.path.dirname(out_file),
                                            "%s.smCounter.cut.vcf" % out_prefix),
                               out_file)
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"], remove_orig=False,
                                    prep_cmd="sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" %
                                    (out_prefix, dd.get_sample_name(paired.tumor_data),
                                     vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)))
Example 10
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file, items=items)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
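shared_variantcall invokes its call_fn argument with the positional interface visible above: (align_bams, ref_file, items, target_regions, tx_out_file). Below is a minimal, hypothetical call_fn conforming to that interface; the bcftools pipeline is purely illustrative, and the six, os, bamprep and do imports are assumed to be available as in the other snippets:

# Hypothetical call_fn matching the interface shared_variantcall expects;
# the bcftools commands are illustrative, not bcbio's actual caller.
def _example_call_fn(align_bams, ref_file, items, target_regions, out_file):
    bams = " ".join(align_bams)
    if target_regions is None:
        region_opt = ""
    elif isinstance(target_regions, six.string_types) and os.path.isfile(target_regions):
        region_opt = "-R %s" % target_regions  # BED file of target regions
    else:
        region_opt = "-r %s" % bamprep.region_to_gatk(target_regions)  # single region
    cmd = ("bcftools mpileup -f {ref_file} {region_opt} {bams} | "
           "bcftools call -mv -O z -o {out_file}")
    do.run(cmd.format(**locals()), "Example bcftools calling")

# Usage with the wrapper above:
# ann_file = shared_variantcall(_example_call_fn, "bcftools-example", align_bams,
#                               ref_file, items, assoc_files, region, out_file)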
Example 11
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
Example 12
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(
                    tz.get_in(["config", "algorithm", "coverage_interval"], x,
                              "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
Example 13
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = (
                            "Region must be a tuple - something odd just happened"
                        )
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end),
                              file=out_handle)
        if any(dd.get_coverage_interval(x) == "genome" for x in items):
            target_bed = shared.remove_highdepth_regions(target_bed, items)
            target_bed = shared.remove_lcr_regions(target_bed, items)
        return ["--bed", target_bed]
    else:
        return []
Example 14
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
Example 15
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
Example 16
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, six.string_types) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
Example 17
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    target = pshared.subset_variant_regions(variant_regions, region, base_file,
                                            items)
    if isinstance(target, six.string_types) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
Example 18
def _add_region_params(region, out_file, items):
    """Add parameters for selecting by region to command line.
    """
    params = []
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += gatk.standard_cl_params(items)
    return params
Example 19
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example 20
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, six.string_types) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example 21
def heterogzygote_counts(paired):
    """Provide tumor/normal counts at population heterozyogte sites with CollectAllelicCounts.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(paired.tumor_data), "structural", "counts"))
    key = "germline_het_pon"
    het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
    vr = bedutils.population_variant_regions([x for x in [paired.tumor_data, paired.normal_data] if x])
    cur_het_bed = bedutils.intersect_two(het_bed, vr, work_dir, paired.tumor_data)
    tumor_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.tumor_data)
    normal_counts = (_run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.normal_data)
                     if paired.normal_data else None)
    return tumor_counts, normal_counts
Example 22
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file,
                                        items)
        if target:
            if isinstance(target, six.string_types) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example 23
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run octopus variant calling, handling both somatic and germline calling.
    """
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region,
                                               out_file, items=items, do_merge=True)
        if paired:
            return _run_somatic(paired, ref_file, target, out_file)
        else:
            return _run_germline(align_bams, items, ref_file, target, out_file)
    return out_file
Example 24
def _get_region_bed(region, items, out_file):
    """Retrieve BED file of regions to analyze, either single or multi-region.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" % (region, target))
    if not isinstance(target, basestring) or not os.path.isfile(target):
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return bedutils.merge_overlaps(target, items[0], out_dir=os.path.dirname(out_file)) + ".gz"
Example 25
def heterogzygote_counts(paired):
    """Provide tumor/normal counts at population heterozyogte sites with CollectAllelicCounts.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(paired.tumor_data), "structural", "counts"))
    key = "germline_het_pon"
    het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
    vr = bedutils.population_variant_regions([x for x in [paired.tumor_data, paired.normal_data] if x])
    cur_het_bed = bedutils.intersect_two(het_bed, vr, work_dir, paired.tumor_data)
    tumor_counts = _run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.tumor_data)
    normal_counts = (_run_collect_allelic_counts(cur_het_bed, key, work_dir, paired.normal_data)
                     if paired.normal_data else None)
    if normal_counts:
        tumor_counts, normal_counts = _filter_by_normal(tumor_counts, normal_counts, paired.tumor_data)
    return tumor_counts, normal_counts
Example 26
def _get_region_bed(region, items, out_file):
    """Retrieve BED file of regions to analyze, either single or multi-region.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" % (region, target))
    if not isinstance(target, basestring) or not os.path.isfile(target):
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return bedutils.merge_overlaps(target, items[0], out_dir=os.path.dirname(out_file)) + ".gz"
Example 27
def _get_regions(region, out_file, items):
    """Retrieve region to run analysis in. Handles no targets, BED and regions.
    """
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs,
                                           region,
                                           out_file,
                                           items=items,
                                           do_merge=True)
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            return "--regions-file %s" % target
        else:
            return "--regions %s" % bamprep.region_to_gatk(target)
    else:
        return ""
Example 28
def get_region_bed(region, items, out_file, want_gzip=True):
    """Retrieve BED file of regions to analyze, either single or multi-region.
    """
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" % (region, target))
    if not isinstance(target, six.string_types) or not os.path.isfile(target):
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    out_file = target
    if want_gzip:
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example 29
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region,
                   (list, tuple)) and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy
            and "--min-alternate-fraction" not in opts and "-F" not in opts):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(
                    tz.get_in(["config", "algorithm", "coverage_interval"], x,
                              "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
Example 30
def run_haplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.population_variant_regions(items, merged=True)
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            out_mode = "--emit_mode gvcf" if joint.want_gvcf(items) else ""
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {out_mode} {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file
Example 31
def run_tnhaplotyper(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(
            bedutils.population_variant_regions(items), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (
                assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = (
                "{license}sentieon driver -t {cores} -r {ref_file} "
                "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                "--algo TNhaplotyper "
                "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = (
                "gunzip -c {tx_orig_file} | "
                "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
Example 32
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = ("Region must be a tuple - something odd just happened")
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        return ["--bed", remove_lcr_regions(target_bed, items)]
    else:
        return []
Example 33
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs,
                                           region,
                                           out_file,
                                           items=items,
                                           do_merge=True)
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(
            paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file),
                                   paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            cmd = (
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                "--maxthreads {cores} --minvf {min_af} --ploidy somatic --gvcf false -o {out_dir}"
            )
            do.run(cmd.format(**locals()), "Pisces tumor-only somatic calling")
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name),
                        tx_out_file)
        vcfutils.bgzip_and_index(
            raw_file,
            paired.tumor_data["config"],
            prep_cmd="sed 's#%s.bam#%s#' | %s" %
            (base_out_name, dd.get_sample_name(paired.tumor_data),
             vcfutils.add_contig_to_header_cl(
                 dd.get_ref_file(paired.tumor_data), out_file)))
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
Example 34
def run_tnscope(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNscope somatic caller.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.population_variant_regions(items, merged=True)
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNscope "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNscope")
    return out_file
Example 35
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, ("Pisces supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region,
                                            out_file, items=items, do_merge=True)
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file), paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            emit_min_af = min_af / 10.0
            cmd = ("pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                   "--maxthreads {cores} --minvf {emit_min_af} --vffilter {min_af} "
                   "--ploidy somatic --gvcf false -o {out_dir}")
            # Recommended filtering for low frequency indels
            # https://github.com/bcbio/bcbio-nextgen/commit/49d0cbb1f6dcbea629c63749e2f9813bd06dcee3#commitcomment-29765373
            cmd += " -RMxNFilter 5,9,0.35"
            # For low frequency UMI tagged variants, set higher variant thresholds
            # https://github.com/Illumina/Pisces/issues/14#issuecomment-399756862
            if min_af < (1.0 / 100.0):
                cmd += " --minbasecallquality 30"
            do.run(cmd.format(**locals()), "Pisces tumor-only somatic calling")
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name),
                        tx_out_file)
        vcfutils.bgzip_and_index(raw_file, paired.tumor_data["config"],
                                 prep_cmd="sed 's#%s.bam#%s#' | %s" %
                                 (base_out_name, dd.get_sample_name(paired.tumor_data),
                                  vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)))
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
Example 36
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
          and cur_ploidy >= base_ploidy and "--min-alternate-fraction" not in opts and "-F" not in opts):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
Example 37
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += [
            "--minimum_mutation_cell_fraction",
            "%.2f" % (min_af / 100.0)
        ]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
Example 38
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Example 39
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        """| %s -c 'from bcbio.variation import freebayes; """
                        """freebayes.call_somatic("%s", "%s")' """ %
                        (sys.executable, paired.tumor_name,
                         paired.normal_name))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                        "| %s "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable),
                                        "py"), _lowfreq_linear_filter(0, True),
                           os.path.join(os.path.dirname(sys.executable), "py"),
                           0, bam.aligner_from_header(paired.tumor_bam)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| awk 'NF>=48' | testsomatic.R "
                    "| var2vcf_paired.pl -P 0.9 -m 4.25 {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "| {contig_cl} {freq_filter} "
                    "| bcftools filter -i 'QUAL >= 0' "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    return out_file
Example 40
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
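
In the multi-sample branch above, each BAM gets its own temporary VCF named from out_file plus the sample's batch name (item["name"][1]), and the per-sample files are merged at the end. A minimal sketch of that naming scheme, assuming a .vcf.gz output path (the helper name is illustrative):

def _per_sample_tmp_vcf(out_file, batch_name):
    # e.g. /path/batch-variants.vcf.gz + "NA12878" ->
    #      /path/batch-variantsNA12878.temp.vcf.gz
    prefix = out_file.replace(".gz", "").replace(".vcf", "") + batch_name
    tmp_out = prefix + ".temp.vcf"
    return tmp_out + ".gz" if out_file.endswith("gz") else tmp_out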
Example No. 41
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region, out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads supporting the variant (-v 50)
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                        "-N {sample} -b {bamfile} {opts} "
                        "| {strandbias}"
                        "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                        "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                                out_file=tx_out_file, ref_file=ref_file,
                                                config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
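
This older version only adds var2vcf_valid.pl's -v option (minimum reads supporting a variant) for very deep targeted panels, keyed off average coverage above 5000x. A minimal sketch of that decision as a standalone helper (the helper name is illustrative; the thresholds match the snippet above):

def _deep_panel_var2vcf_opts(data, min_var_reads=50, deep_coverage=5000):
    # Only very deep panels get a minimum variant-read requirement, where
    # low-level sequencing noise would otherwise produce spurious calls.
    if dd.get_avg_coverage(data) > deep_coverage:
        return " -v %s " % min_var_reads
    return ""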
Example No. 42
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                lowfreq_filter = _lowfreq_linear_filter(0, False)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| teststrandbias.R "
                       "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    return out_file
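
Both output branches above fall back to writing a header-only VCF when the region has no usable BED target, instead of invoking VarDict at all. A minimal sketch of that guard collapsed into one helper (the helper name is illustrative; the real code formats cmd with locals() before running it):

def _run_or_write_empty(cmd, target, tx_file, config, sample):
    # With no BED-defined target regions there is nothing for VarDict to scan,
    # so emit an empty VCF that carries only the sample header.
    if not _is_bed_file(target):
        vcfutils.write_empty_vcf(tx_file, config, samples=[sample])
    else:
        do.run(cmd + " > " + tx_file, "Genotyping with VarDict: Inference", {})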
Example No. 43
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region,
                                                   out_file, items=items, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                                   "| %s "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                    _lowfreq_linear_filter(0, True),
                                    os.path.join(os.path.dirname(sys.executable), "py"),
                                    0, bam.aligner_from_header(paired.tumor_bam)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| awk 'NF>=48' | testsomatic.R "
                       "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "| {contig_cl} {freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
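
The somatic post-filtering in this paired caller is skipped entirely when any sample in the batch lists vardict_somatic_filter under tools_off; otherwise -M is appended so VarDict soft-filters non-differential variants and the sed/py re-labelling is piped in. A minimal sketch of that toggle (the helper name is illustrative):

def _somatic_filter_enabled(items):
    # Disabled when any sample in the batch opts out via algorithm -> tools_off.
    return not any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items)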