Beispiel #1
0
def _produce_compatible_vcf(out_file, data, is_somatic):
    """Create a compatible VCF that downstream tools can deal with.

    - htsjdk and thus GATK and Picard do not support VCF4.3:
      https://github.com/broadinstitute/gatk/issues/2092
    - Use octopus legacy format to avoid incompatibilities.
      https://github.com/luntergroup/octopus#output-format
    - Fixes `##contig` lines since octopus only writes contigs
      used in the BED file region, causing incompatibilies with
      GatherVcfs when merging
    - Fixes alleles prefixed with '*' like 'C,*T' which cause
      downstream failures when reading with GATK.
    """
    base, ext = utils.splitext_plus(out_file)
    legacy_file = "%s.legacy%s" % (base, ext)
    if is_somatic:
        legacy_file = _covert_to_diploid(legacy_file, data)
    final_file = "%s.vcf.gz" % base
    cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file)
    remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'"
    cmd = ("{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
           "{remove_problem_alleles} | {contig_cl} | bgzip -c > {final_file}")
    do.run(cmd.format(**locals()), "Produce compatible VCF output file from octopus")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only smCounter2 calling.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, ("Pisces supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region,
                                            out_file, items=items, do_merge=True)
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["smCounter2", "--runPath", os.path.dirname(tx_out_file),
                   "--outPrefix", out_prefix,
                   "--bedTarget", target, "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam, "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(paired.tumor_data)]
            do.run(cmd, "smcounter2 variant calling")
            for fname in glob.glob(os.path.join(os.path.dirname(tx_out_file), "*.smCounter*")):
                shutil.move(fname, os.path.join(os.path.dirname(out_file), os.path.basename(fname)))
            utils.symlink_plus(os.path.join(os.path.dirname(out_file),
                                            "%s.smCounter.cut.vcf" % out_prefix),
                               out_file)
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"], remove_orig=False,
                                    prep_cmd="sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" %
                                    (out_prefix, dd.get_sample_name(paired.tumor_data),
                                     vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)))
Beispiel #3
0
def _produce_compatible_vcf(out_file, data, is_somatic=False):
    """Create a compatible VCF that downstream tools can deal with.

    - htsjdk and thus GATK and Picard do not support VCF4.3:
      https://github.com/broadinstitute/gatk/issues/2092
    - Use octopus legacy format to avoid incompatibilities.
      https://github.com/luntergroup/octopus#output-format
    - Fixes `##contig` lines since octopus only writes contigs
      used in the BED file region, causing incompatibilies with
      GatherVcfs when merging
    - Fixes alleles prefixed with '*' like 'C,*T' which cause
      downstream failures when reading with GATK.
    - Changes phase set (PS) header to be type Integer.
    """
    base, ext = utils.splitext_plus(out_file)
    #legacy_file = "%s.legacy%s" % (base, ext)
    legacy_file = out_file
    if is_somatic:
        legacy_file = _covert_to_diploid(legacy_file, data)
    final_file = "%s.vcf.gz" % base
    cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data),
                                                 out_file)
    remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'"
    fix_phasing_header = r"sed 's/ID=PS,Number=1,Type=String/ID=PS,Number=1,Type=Integer/'"
    cmd = (
        "{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
        "{remove_problem_alleles} | {fix_phasing_header} | {contig_cl} | bgzip -c > {final_file}"
    )
    do.run(cmd.format(**locals()),
           "Produce compatible VCF output file from octopus")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Beispiel #4
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # caling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} | {add_contig} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return out_file
Beispiel #5
0
def _add_contig_cl(in_file, items):
    has_contigs = False
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##contig"):
                has_contigs = True
                break
            elif not line.startswith("##"):
                break
    if not has_contigs:
        return vcfutils.add_contig_to_header_cl(items[0])
Beispiel #6
0
def _run_scalpel_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(
                tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region,
                                             tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = (
                "{perl_exports} && "
                "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            add_contig = vcfutils.add_contig_to_header_cl(
                dd.get_ref_file(items[0]), tx_out_file)
            cl2 = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "| {add_contig} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return out_file
def _add_contig_cl(in_file, items):
    has_contigs = False
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##contig"):
                has_contigs = True
                break
            elif not line.startswith("##"):
                break
    if not has_contigs:
        return vcfutils.add_contig_to_header_cl(items[0])
def map_coords_to_ucsc(grc_cosmic, ref_file, out_file):
    hg19_ref_file = ref_file.replace("GRCh37", "hg19")
    if not os.path.exists(out_file):
        contig_cl = vcfutils.add_contig_to_header_cl(hg19_ref_file, out_file)
        cmd = ("zcat {grc_cosmic} | "
               r'sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
               "| {contig_cl} "
               "| bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists("%s-header.txt" % utils.splitext_plus(out_file)[0]):
        os.remove("%s-header.txt" % utils.splitext_plus(out_file)[0])
    return vcfutils.bgzip_and_index(out_file, {})
def map_coords_to_ucsc(grc_cosmic, ref_file, out_file):
    hg19_ref_file = ref_file.replace("GRCh37", "hg19")
    if not os.path.exists(out_file):
        contig_cl = vcfutils.add_contig_to_header_cl(hg19_ref_file, out_file)
        cmd = ("zcat {grc_cosmic} | "
               r'sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
               "| {contig_cl} "
               "| bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists("%s-header.txt" % utils.splitext_plus(out_file)[0]):
        os.remove("%s-header.txt" % utils.splitext_plus(out_file)[0])
    return vcfutils.bgzip_and_index(out_file, {})
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = [
                "platypus", "callVariants",
                "--regions=%s" % _subset_regions(region, out_file, items),
                "--bamFiles=%s" % ",".join(align_bams),
                "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                "--logFileName", "/dev/null", "--verbosity=1"
            ]
            resources = config_utils.get_resources("platypus",
                                                   items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (
                " | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                 vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]),
                                                  tx_out_file), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd,
                   "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def sort_to_ref(fname, ref_file, add_chr):
    """Match reference genome ordering.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        if add_chr:
            fix_chrom = r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
        else:
            fix_chrom = ''
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        cmd = ("gunzip -c {fname} {fix_chrom} | "
               "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
               "bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
def sort_to_ref(fname, ref_file, add_chr):
    """Match reference genome ordering.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        if add_chr:
            fix_chrom = r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
        else:
            fix_chrom = ''
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        cmd = ("gunzip -c {fname} {fix_chrom} | "
               "bcftools norm --check-ref s --do-not-normalize -f {ref_file} |"
               "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
               "bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
Beispiel #13
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(),
                                                                   vcfutils.fix_ambiguous_cl(5),
                                                                   vcfutils.add_contig_to_header_cl(items[0]),
                                                                   tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Beispiel #14
0
def sort_to_ref(fname, ref_file, add_chr):
    """Match reference genome ordering.
    """
    logging.info(f"Sorting {fname} to match the order of {ref_file}.")
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        if add_chr:
            fix_chrom = r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
        else:
            fix_chrom = ''
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        cmd = ("gunzip -c {fname} {fix_chrom} | "
               "bcftools norm --check-ref s --do-not-normalize -f {ref_file} |"
               "bcftools view -e 'SNP=1' |"
               "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
               "bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    logging.info(f"bgzipping and indexing {out_file}.")
    return vcfutils.bgzip_and_index(out_file, {})
Beispiel #15
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                   "| {add_contig} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return out_file
Beispiel #16
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs,
                                           region,
                                           out_file,
                                           items=items,
                                           do_merge=True)
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(
            paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file),
                                   paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            cmd = (
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                "--maxthreads {cores} --minvf {min_af} --ploidy somatic --gvcf false -o {out_dir}"
            )
            do.run(cmd.format(**locals()), "Pisces tumor-only somatic calling")
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name),
                        tx_out_file)
        vcfutils.bgzip_and_index(
            raw_file,
            paired.tumor_data["config"],
            prep_cmd="sed 's#%s.bam#%s#' | %s" %
            (base_out_name, dd.get_sample_name(paired.tumor_data),
             vcfutils.add_contig_to_header_cl(
                 dd.get_ref_file(paired.tumor_data), out_file)))
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
Beispiel #17
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, ("Pisces supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region,
                                            out_file, items=items, do_merge=True)
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file), paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            emit_min_af = min_af / 10.0
            cmd = ("pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                   "--maxthreads {cores} --minvf {emit_min_af} --vffilter {min_af} "
                   "--ploidy somatic --gvcf false -o {out_dir}")
            # Recommended filtering for low frequency indels
            # https://github.com/bcbio/bcbio-nextgen/commit/49d0cbb1f6dcbea629c63749e2f9813bd06dcee3#commitcomment-29765373
            cmd += " -RMxNFilter 5,9,0.35"
            # For low frequency UMI tagged variants, set higher variant thresholds
            # https://github.com/Illumina/Pisces/issues/14#issuecomment-399756862
            if min_af < (1.0 / 100.0):
                cmd += " --minbasecallquality 30"
            do.run(cmd.format(**locals()), "Pisces tumor-only somatic calling")
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name),
                        tx_out_file)
        vcfutils.bgzip_and_index(raw_file, paired.tumor_data["config"],
                                 prep_cmd="sed 's#%s.bam#%s#' | %s" %
                                 (base_out_name, dd.get_sample_name(paired.tumor_data),
                                  vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)))
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
Beispiel #18
0
def _produce_compatible_vcf(out_file, data):
    """Create a compatible VCF that downstream tools can deal with.

    - htsjdk and thus GATK and Picard do not support VCF4.3:
      https://github.com/broadinstitute/gatk/issues/2092
    - Use octopus legacy format to avoid incompatibilities.
      https://github.com/luntergroup/octopus#output-format
    - Fixes `##contig` lines since octopus only writes contigs
      used in the BED file region, causing incompatibilies with
      GatherVcfs when merging
    """
    base, ext = utils.splitext_plus(out_file)
    legacy_file = "%s.legacy%s" % (base, ext)
    final_file = "%s.vcf.gz" % base
    cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data),
                                                 out_file)
    cmd = (
        "{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
        "{contig_cl} | bgzip -c > {final_file}")
    do.run(cmd.format(**locals()),
           "Produce compatible VCF output file from octopus")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Beispiel #19
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        """| %s -c 'from bcbio.variation import freebayes; """
                        """freebayes.call_somatic("%s", "%s")' """ %
                        (sys.executable, paired.tumor_name,
                         paired.normal_name))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "| {contig_cl} {freq_filter} "
                    "| bcftools filter -i 'QUAL >= 0' "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    return out_file
Beispiel #20
0
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
Beispiel #21
0
def _run_scalpel_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(
                    _scalpel_options_from_config(items, config, out_file,
                                                 region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # caling
                cl = (
                    "{perl_exports} && "
                    "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
                )
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(
                _scalpel_bed_file_opts(items, config, out_file, region,
                                       tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path,
                                                "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(
                    tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config,
                                      scalpel_tmp_file) as tx_indel_file:
                    cmd = (
                        "{perl_exports} && "
                        "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                        "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                        "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression(
                "reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            add_contig = vcfutils.add_contig_to_header_cl(items[0])
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " {fix_ambig} | {vcfstreamsort} | {add_contig} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return out_file
Beispiel #22
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                lowfreq_filter = _lowfreq_linear_filter(0, False)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| teststrandbias.R "
                       "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    return out_file
Beispiel #23
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region,
                                                   out_file, items=items, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                                   "| %s "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                    _lowfreq_linear_filter(0, True),
                                    os.path.join(os.path.dirname(sys.executable), "py"),
                                    0, bam.aligner_from_header(paired.tumor_bam)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| awk 'NF>=48' | testsomatic.R "
                       "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "| {contig_cl} {freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file