Esempio n. 1
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Run GATK's MuTect2 on a batch and return the bgzipped, indexed VCF.

    This requires the full non open-source version of GATK 3.5+.

    When ``out_file`` is not supplied it is derived from the first BAM as
    ``<bam base>-variants.vcf.gz``. Calling is skipped if that file already
    exists; the final ``bgzip_and_index`` step runs either way.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            config = items[0]["config"]
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for extra_annotation in annotation.get_gatk_annotations(config):
                params.extend(["--annotation", extra_annotation])
            paired = vcfutils.get_paired_bams(align_bams, items)
            params.extend(_add_tumor_params(paired))
            params.extend(_add_region_params(region, out_file, items))
            params.extend(_add_assoc_params(assoc_files))
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            # User supplied command line options from the "mutect2" resources
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                params.extend([str(opt) for opt in resources.get("options", [])])
            broad_runner = broad.runner_from_config(config)
            gatk_version = LooseVersion(broad_runner.gatk_major_version())
            assert gatk_version >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cl = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            # GATK output is streamed through the post-processing command and
            # compressed into the transactional output file.
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}".format(
                gatk_cmd=" ".join(gatk_cl),
                pp_cmd=_post_process_cl(paired),
                tx_out_file=tx_out_file)
            do.run(cmd, "MuTect2")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 2
0
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Set up a MuTect run: index inputs and assemble the command line.

    Returns a ``(broad_runner, params)`` tuple. Raises ValueError when the
    batch does not resolve to a tumor (optionally plus normal) pairing.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        sample_names = ", ".join([dd.get_sample_name(x) for x in items])
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % sample_names)
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS",
              "--read_filter", "NotPrimaryAlignment",
              "-I:tumor", paired.tumor_bam,
              "--tumor_sample_name", paired.tumor_name]
    # Normal sample and panel of normals are both optional
    if paired.normal_bam is not None:
        params.extend(["-I:normal", paired.normal_bam,
                       "--normal_sample_name", paired.normal_name])
    if paired.normal_panel is not None:
        params.extend(["--normal_panel", paired.normal_panel])
    params.extend(_config_params(base_config, assoc_files, region, out_file))
    return broad_runner, params
Esempio n. 3
0
def run(items, background=None):
    """Run WHAM on a batched set of samples and attach the calls to each sample.

    items -- list of sample data dictionaries in the batch; each must carry
      an "align_bam" entry.
    background -- optional list of sample data dictionaries used as
      background when the batch is not a tumor/normal pairing.

    Returns the input items, each with a new entry appended to its "sv"
    list holding the shared WHAM VCF ("vrn_file") and BED ("bed_file").
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # Tumor-only batches have no normal sample: pass empty background
        # lists rather than [None] to the downstream helpers.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        # inputs is a flat list of sample dictionaries, matching the shape
        # used in the paired branch (previously wrapped in an extra list).
        inputs = items
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({
            "variantcaller": "wham",
            "vrn_file": vcf_file,
            "bed_file": bed_file
        })
        out.append(data)
    return out
Esempio n. 4
0
def run(items, background=None):
    """Run WHAM on a batch and attach a per-sample VCF to each input sample.

    For a tumor/normal pairing the tumor is the input and the normal (when
    present) the background; otherwise case and control samples come from
    ``shared.find_case_control``. Returns the input samples, each with a
    "wham" entry appended to its "sv" list.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        data.setdefault("sv", [])
        vcf_base = utils.splitext_plus(orig_vcf)[0]
        target_vcf = "%s-%s.vcf.gz" % (vcf_base, dd.get_sample_name(data))
        # Pull this sample's calls out of the combined WHAM VCF
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            target_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": sample_vcf})
        out.append(data)
    return out
Esempio n. 5
0
def _mutect_call_prep(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """Set up a MuTect run: index inputs and assemble the command line.

    Returns a ``(broad_runner, params)`` tuple.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    # coverage_depth_max falls back to 10000 when not configured
    downsample_cov = get_in(paired.tumor_config,
                            ("algorithm", "coverage_depth_max"), 10000)
    # A non-positive value omits the flag so MuTect applies its own default;
    # positive values are passed with a floor of 1500.
    if downsample_cov > 0:
        params.extend(["--downsample_to_coverage", max(1500, downsample_cov)])
    params.extend(["--read_filter", "NotPrimaryAlignment",
                   "-I:tumor", paired.tumor_bam,
                   "--tumor_sample_name", paired.tumor_name])
    if paired.normal_bam is not None:
        params.extend(["-I:normal", paired.normal_bam,
                       "--normal_sample_name", paired.normal_name])
    if paired.normal_panel is not None:
        params.extend(["--normal_panel", paired.normal_panel])
    params.extend(_config_params(base_config, assoc_files, region, out_file))
    return broad_runner, params
Esempio n. 6
0
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Index the input BAMs and build SomaticIndelDetector parameters.

    Returns the list of command line parameters; ``assoc_files`` and
    ``out_file`` are accepted but not used here.
    """
    base_config = items[0]["config"]
    for bam_file in align_bams:
        bam.index(bam_file, base_config)

    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Clamp the per-base read start count to the 200-10000 range and scale
    # maxNumberOfReads to match, otherwise SID discards windows for high
    # coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    configured_depth = get_in(paired.tumor_config,
                              ("algorithm", "coverage_depth_max"), 10000)
    max_depth = min(max(200, configured_depth), 10000)
    params.extend(["--downsample_to_coverage", max_depth,
                   "--maxNumberOfReads", str(int(max_depth) * window_size),
                   "--read_filter", "NotPrimaryAlignment",
                   "-I:tumor", paired.tumor_bam])
    # min_allele_fraction is configured as a percentage
    min_af_pct = get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)
    min_af = float(min_af_pct) / 100.0
    if paired.normal_bam is None:
        params.append("--unpaired")
        filter_expr = "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af
    else:
        params.extend(["-I:normal", paired.normal_bam])
        # notice there must be at least 4 reads of coverage in normal
        filter_expr = "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af
    params.extend(["--filter_expressions", filter_expr])
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])
    return params
Esempio n. 7
0
def estimate(items, batch, config):
    """Estimate heterogeneity for a pair of tumor/normal samples. Run in parallel.

    Runs every configured heterogeneity caller that has an implementation
    and stores the collected results under "heterogeneity" on the tumor
    sample. Returns the samples whose first batch equals ``batch``, each
    wrapped in its own single-element list.
    """
    hetcallers = {
        "theta": theta.run,
        "phylowgs": phylowgs.run,
        "bubbletree": bubbletree.run
    }
    paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items],
                                      items)
    calls = _get_calls(paired.tumor_data)
    variants = get_variants(paired.tumor_data)
    het_info = []
    for hetcaller in _get_hetcallers(items):
        if hetcaller not in hetcallers:
            print("%s not yet implemented" % hetcaller)
            continue
        hetout = hetcallers[hetcaller](variants[0], calls, paired)
        if hetout:
            het_info.append(hetout)
    out = []
    for data in items:
        if batch != _get_batches(data)[0]:
            continue
        # Only the tumor sample carries the heterogeneity results
        if dd.get_sample_name(data) == paired.tumor_name and het_info:
            data["heterogeneity"] = het_info
        out.append([data])
    return out
Esempio n. 8
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.

    When the pairing has no normal BAM the work is handed to
    ``_run_scalpel_caller`` and its result is returned directly. Otherwise
    ``scalpel-discovery --somatic`` runs in a ``-scalpel-work`` directory
    derived from ``out_file``, and the somatic and common indel VCFs are
    passed through the commands from
    ``get_scalpel_bcftools_filter_expression``, concatenated and sorted
    into ``out_file``.

    Returns the path produced by ``annotation.annotate_nongatk_vcf``.

    NOTE: the command templates below are filled with ``locals()``, so the
    local variable names they reference must not be renamed.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal available: fall back to the non-paired caller.
                # This returns from inside the transaction, skipping the
                # annotation step at the end of this function.
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Discovery is skipped when a previous run left "somatic.db.dir"
            # behind; otherwise any existing work directory is removed first.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # use_defaults is hard-coded to True, so the scalpel-export
            # branch below is currently never executed.
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            # Only compress the final stream when the target name ends in "gz"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The two filtered VCF streams are supplied to vcfcat through
            # "<(...)" shell process substitution.
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Esempio n. 9
0
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill the ``_paired_prep`` template for a tumor/normal CNV assessment.

    The tumor sample is passed as the case and the normal sample as the
    control. ``names`` is accepted but not used here.
    """
    paired = vcfutils.get_paired_bams(work_bams, items)
    template_args = {
        "case_file": paired.tumor_bam,
        "case_name": paired.tumor_name,
        "ctrl_file": paired.normal_bam,
        "ctrl_name": paired.normal_name,
        "num_cores": 0,
        "chrom": chrom,
        "pairmode": pairmode,
    }
    return _paired_prep.format(**template_args)
Esempio n. 10
0
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Prepare inputs and command line parameters for SomaticIndelDetector.

    Indexes every input BAM and returns the parameter list. ``assoc_files``
    and ``out_file`` are accepted but not used here.
    """
    base_config = items[0]["config"]
    for align_bam in align_bams:
        bam.index(align_bam, base_config)

    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Keep the per-base read start count between 200 and 10000, and limit
    # maxNumberOfReads accordingly, otherwise SID discards windows for high
    # coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    depth_setting = get_in(paired.tumor_config,
                           ("algorithm", "coverage_depth_max"), 10000)
    max_depth = min(max(200, depth_setting), 10000)
    max_reads = str(int(max_depth) * window_size)
    params.extend(["--downsample_to_coverage", max_depth])
    params.extend(["--maxNumberOfReads", max_reads])
    params.extend(["--read_filter", "NotPrimaryAlignment"])
    params.extend(["-I:tumor", paired.tumor_bam])
    # Configured as a percentage; converted to a fraction for the filters
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    has_normal = paired.normal_bam is not None
    if has_normal:
        params.extend(["-I:normal", paired.normal_bam])
        # notice there must be at least 4 reads of coverage in normal
        params.extend(["--filter_expressions",
                       "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af])
    else:
        params.extend(["--unpaired"])
        params.extend(["--filter_expressions",
                       "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af])
    if region:
        params.extend(["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                       "INTERSECTION"])
    return params
Esempio n. 11
0
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).

    Requires a tumor/normal pairing with a normal BAM. The raw Sentieon
    output is written next to the transactional output with an ``-orig``
    suffix, then re-streamed with several header fields retyped as String.
    Returns ``out_file``.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    data = items[0]
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    interval = _get_interval(variant_regions, region, out_file, items)
    with file_transaction(data, out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
        # Optional annotation resources are only passed when configured
        dbsnp = ""
        if "dbsnp" in assoc_files:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp"))
        cosmic = ""
        if "cosmic" in assoc_files:
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic"))
        license_prefix = license_export(data)
        tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
        cores = dd.get_num_cores(data)
        call_cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                    "--algo TNhaplotyper "
                    "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                    "{dbsnp} {cosmic} {tx_orig_file}")
        do.run(call_cmd.format(license=license_prefix, cores=cores, ref_file=ref_file,
                               paired=paired, interval=interval, dbsnp=dbsnp,
                               cosmic=cosmic, tx_orig_file=tx_orig_file),
               "Sentieon TNhaplotyper")
        # Rewrite header type declarations on the way to the final output
        header_cmd = ("gunzip -c {tx_orig_file} | "
                      "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                      "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                      "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                      "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                      "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                      "bgzip -c > {tx_out_file}")
        do.run(header_cmd.format(tx_orig_file=tx_orig_file, tx_out_file=tx_out_file),
               "Sentieon TNhaplotyper: make headers GATK compatible")
        vcfutils.bgzip_and_index(tx_out_file, data["config"])
    return out_file
Esempio n. 12
0
def estimate(items, batch, config):
    """Estimate heterogeneity for a pair of tumor/normal samples. Run in parallel.

    items -- sample data dictionaries belonging to one batch.
    batch -- batch name; only samples whose first batch matches are returned.
    config -- accepted but not used here.

    Runs every configured heterogeneity caller that has an implementation
    and stores the collected results under "heterogeneity" on the tumor
    sample. Returns the matching samples, each wrapped in its own
    single-element list.
    """
    hetcallers = {"theta": theta.run,
                  "phylowgs": phylowgs.run,
                  "bubbletree": bubbletree.run}
    paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items)
    calls = _get_calls(paired.tumor_data)
    variants = _get_variants(paired.tumor_data)
    het_info = []
    for hetcaller in _get_hetcallers(items):
        try:
            hetfn = hetcallers[hetcaller]
        except KeyError:
            hetfn = None
            # Call form of print: valid syntax on both Python 2 and Python 3
            print("%s not yet implemented" % hetcaller)
        if hetfn:
            hetout = hetfn(variants[0], calls, paired)
            if hetout:
                het_info.append(hetout)
    out = []
    for data in items:
        if batch == _get_batches(data)[0]:
            # Only the tumor sample carries the heterogeneity results
            if dd.get_sample_name(data) == paired.tumor_name:
                if het_info:
                    data["heterogeneity"] = het_info
            out.append([data])
    return out
Esempio n. 13
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Raises ValueError when any sample is configured with an aligner other
    than "bwa" (an unset aligner is accepted). Returns the input samples,
    each with a "lumpy" entry appended to its "sv" list.
    """
    allowed_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in allowed_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Work directory comes from the tumor sample when paired, else the first sample
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    for data in items:
        data.setdefault("sv", [])
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        # Extract this sample from the joint lumpy VCF, then genotype and filter
        per_sample_name = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, per_sample_name,
                                            data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": filter_vcf,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Esempio n. 14
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    Raises ValueError when the pairing has no normal BAM. Returns the path
    produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-paired-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            config_opts = _freebayes_options_from_config(items, config, out_file, region)
            opts = " ".join(config_opts) + " -f {}".format(ref_file)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            template = ("{freebayes} --pooled-discrete --pvar 0.7"
                        " --genotype-qualities {opts} {paired.tumor_bam}"
                        " {paired.normal_bam} | {vcfsamplediff} -s VT"
                        " {paired.normal_name} {paired.tumor_name}"
                        " - {compress_cmd} >  {tx_out_file}")
            # Both BAMs are indexed before calling
            for pair_bam in (paired.tumor_bam, paired.normal_bam):
                bam.index(pair_bam, config)
            cl = template.format(freebayes=freebayes, opts=opts, paired=paired,
                                 vcfsamplediff=vcfsamplediff,
                                 compress_cmd=compress_cmd,
                                 tx_out_file=tx_out_file)
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"], ref_file,
                                           config)
Esempio n. 15
0
def run(items, background=None):
    """Run WHAM on a batched set of samples and attach the calls to each sample.

    items -- list of sample data dictionaries in the batch; each must carry
      an "align_bam" entry.
    background -- must be empty when the batch is not a tumor/normal
      pairing; case and control samples are then taken from
      ``shared.find_case_control``.

    Returns the input items, each with a "wham" entry appended to its "sv"
    list holding the per-sample subset of the BED output ("vrn_file") and
    the shared VCF ("vcf_file").
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # Tumor-only batches have no normal sample: use empty background
        # lists rather than [None], which would be passed to _run_wham and
        # switch on use_lrt without any background.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs, use_lrt=len(background_bams) > 0)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": _subset_to_sample(bed_file, data),
                           "vcf_file": vcf_file})
        out.append(data)
    return out
Esempio n. 16
0
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.

    Runs qSNP from inside a temporary working directory, then filters,
    bgzips and indexes the result. When ``out_file`` already exists it is
    returned untouched.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    # qSNP output is handled uncompressed; compression happens afterwards
    out_file = out_file.replace(".gz", "")
    with file_transaction(config, out_file) as tx_out_file, \
            tx_tmpdir(config) as tmpdir, \
            utils.chdir(tmpdir):
        paired = get_paired_bams(align_bams, items)
        qsnp = config_utils.get_program("qsnp", config)
        resources = config_utils.get_resources("qsnp", config)
        mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
        qsnp_log = os.path.join(tmpdir, "qsnp.log")
        qsnp_init = os.path.join(tmpdir, "qsnp.ini")
        if region:
            paired = _create_bam_region(paired, region, tmpdir)
        _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
        cl = "{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}".format(
            qsnp=qsnp, mem=mem, qsnp_init=qsnp_init, qsnp_log=qsnp_log)
        do.run(cl, "Genotyping paired variants with Qsnp", {})
    filtered_file = _filter_vcf(out_file)
    return bgzip_and_index(filtered_file, config)
Esempio n. 17
0
def run_qsnp(align_bams,
             items,
             ref_file,
             assoc_files,
             region=None,
             out_file=None):
    """Run qSNP calling on paired tumor/normal.

    Returns ``out_file`` immediately when it already exists. Raises
    ValueError when the pairing has no normal BAM.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("qSNP only works on paired samples")

    regions = _clean_regions(items, region)
    if regions:
        region_files = []
        # NOTE: this loop rebinds ``region``, so the ``if not region`` check
        # below sees the last cleaned region rather than the argument.
        for region in regions:
            region_suffix = _to_str(region) + ".vcf.gz"
            out_region_file = out_file.replace(".vcf.gz", region_suffix)
            region_files.append(
                _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                                 region, out_region_file))
        out_file = combine_variant_files(region_files, out_file, ref_file,
                                         items[0]["config"])
    if not region:
        out_file = _run_qsnp_paired(align_bams, items, ref_file,
                                    assoc_files, region, out_file)
    return out_file
Esempio n. 18
0
def run_freebayes(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run FreeBayes variant calling, either paired tumor/normal or germline calling.

    Dispatches on the batch layout: non-paired batches and tumor-only
    pairings use ``_run_freebayes_caller`` (the latter with the pairing
    passed as ``somatic``); tumor/normal pairings use
    ``_run_freebayes_paired``.
    """
    if not is_paired_analysis(align_bams, items):
        vcfutils.check_paired_problems(items)
        return _run_freebayes_caller(align_bams, items, ref_file,
                                     assoc_files, region, out_file)

    paired = get_paired_bams(align_bams, items)
    if paired.normal_bam:
        return _run_freebayes_paired(align_bams, items, ref_file,
                                     assoc_files, region, out_file)
    # Tumor-only: single sample calling with the pairing information
    return _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                                 region, out_file, somatic=paired)
Esempio n. 19
0
def run(items, background=None):
    """Run WHAM on a batch and attach an effects-annotated VCF to each input sample.

    For a tumor/normal pairing the tumor is the input and the normal (when
    present) the background; otherwise case and control samples come from
    ``shared.find_case_control``. Returns the input samples, each with a
    "wham" entry appended to its "sv" list.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        data.setdefault("sv", [])
        vcf_base = utils.splitext_plus(orig_vcf)[0]
        target_vcf = "%s-%s.vcf.gz" % (vcf_base, dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            target_vcf, data["config"])
        # Fall back to the plain per-sample VCF when no effects file comes back
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        call = {"variantcaller": "wham",
                "vrn_file": effects_vcf or sample_vcf}
        data["sv"].append(call)
        out.append(data)
    return out
Esempio n. 20
0
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.

    items -- list of sample data dictionaries in the batch; each must carry
      an "align_bam" entry.
    background -- accepted but not used here.

    The run is skipped with a logged warning when the batch is not a
    tumor/normal pairing or when the tumor sample lacks the
    genome_resources -> aliases -> human entry. Returns the input items;
    when Battenberg ran, each has its result appended to its "sv" list.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    batout = None
    # logger.warning replaces the deprecated logger.warn alias
    if not paired or not paired.normal_bam:
        logger.warning(
            "Battenberg only works on paired tumor/normal inputs, skipping %s"
            % dd.get_sample_name(items[0]))
    elif not tz.get_in(["genome_resources", "aliases", "human"],
                       paired.tumor_data):
        logger.warning("Battenberg only works on human data, skipping %s" %
                       dd.get_sample_name(items[0]))
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
Esempio n. 21
0
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Set up a MuTect run: index inputs and assemble the command line.

    Returns a ``(broad_runner, params)`` tuple.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    # Downsampling depth: configured coverage_depth_max (10000 when unset),
    # never lower than 200.
    depth_setting = get_in(paired.tumor_config,
                           ("algorithm", "coverage_depth_max"), 10000)
    params.extend(["--downsample_to_coverage", max(200, depth_setting),
                   "--read_filter", "NotPrimaryAlignment",
                   "-I:tumor", paired.tumor_bam,
                   "--tumor_sample_name", paired.tumor_name])
    if paired.normal_bam is not None:
        params.extend(["-I:normal", paired.normal_bam,
                       "--normal_sample_name", paired.normal_name])
    if paired.normal_panel is not None:
        params.extend(["--normal_panel", paired.normal_panel])
    params.extend(_config_params(base_config, assoc_files, region, out_file))
    return broad_runner, params
Esempio n. 22
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet, FreeBayes is run on the
    tumor and normal BAMs and piped through ``vcffilter`` and
    ``vcfsamplediff`` (plus ``bgzip`` when the output name ends in ``gz``).
    Afterwards ``fix_somatic_calls`` is applied to the output and the result
    is passed to ``annotation.annotate_nongatk_vcf``.

    Returns the value returned by ``annotation.annotate_nongatk_vcf``.

    Raises ValueError when the pairing has no normal BAM. ``assoc_files`` is
    indexed directly with ``"dbsnp"``, so that key must be present.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError(
                    "Require both tumor and normal BAM files for FreeBayes cancer calling"
                )

            # Program paths; the local names are referenced by the command
            # template below through ``locals()``.
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config, out_file,
                                               region))
            opts += " -f {}".format(ref_file)
            # Only add a default allele fraction when the configured options
            # do not already carry one (substring check on the option text).
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(
                    utils.get_in(paired.tumor_config,
                                 ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = (
                "{freebayes} --pooled-discrete --genotype-qualities "
                "{opts} {paired.tumor_bam} {paired.normal_bam} "
                "| {vcffilter} -f 'QUAL > 1' -s "
                "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                "{compress_cmd} >  {tx_out_file}")
            # Both BAMs are indexed before the command is run.
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            # The template is filled from local variable names, so renaming
            # any local used above changes the generated command.
            do.run(cl.format(**locals()),
                   "Genotyping paired variants with FreeBayes", {})
    # Post-processing runs whether or not the calls were just generated.
    fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
Esempio n. 23
0
def run_tnscope(align_bams,
                items,
                ref_file,
                assoc_files,
                region=None,
                out_file=None):
    """Call variants with Sentieon's TNscope somatic caller.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet, a ``sentieon driver``
    command with ``--algo TNscope`` is built for the tumor/normal pair and
    executed, writing to a transactional output path.

    Returns ``out_file``.

    Raises AssertionError when no pairing or no normal BAM is available.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # Merge overlapping variant regions of the first item, then derive
        # the interval argument inserted into the command.
        variant_regions = bedutils.merge_overlaps(
            dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
            # dbSNP is optional: the flag is only emitted when the key exists.
            dbsnp = "--dbsnp %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            # NOTE(review): ``license`` shadows the interactive builtin, but
            # the name is referenced by the template below and cannot be
            # renamed without editing it. The template places it directly
            # before ``sentieon`` with no space, so license_export presumably
            # returns a prefix that ends in a separator -- confirm.
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            cmd = (
                "{license}sentieon driver -t {cores} -r {ref_file} "
                "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                "--algo TNscope "
                "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                "{dbsnp} {tx_out_file}")
            # The template is filled from local variable names.
            do.run(cmd.format(**locals()), "Sentieon TNscope")
    return out_file
Esempio n. 24
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet, either an empty VCF is
    written (when the option helper reports no target regions) or FreeBayes
    is run and its output piped through a chain of filtering and
    normalization commands. The output is then passed to
    ``annotation.annotate_nongatk_vcf``, whose return value is returned.

    Raises AssertionError when the pairing has no normal BAM.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            # The helper returns the option list plus a flag that signals
            # there is nothing to call in this region.
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                # Write a header-only VCF for whichever sample names are set.
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                # NOTE(review): the template has no pipe between
                # {clean_fmt_cmd} and ``bcftools view``, so this helper
                # presumably returns a command ending in "|" -- confirm.
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                # ``py`` executable located next to the running interpreter.
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                # The template is filled from local variable names, so
                # renaming any local used above changes the command.
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    # dbSNP is optional here: a missing key is passed on as None.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Esempio n. 25
0
def _ready_for_het_analysis(items):
    """Report whether the inputs can be used for heterogeneity analysis.

    A tumor/normal pair is required, and the tumor sample needs both variant
    calls and CNV calls. Returns None when there is no pair with a normal
    BAM; otherwise returns the result of combining both lookups with ``and``.
    """
    align_bams = [dd.get_align_bam(sample) for sample in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not (paired and paired.normal_bam):
        return None
    tumor = paired.tumor_data
    return _get_variants(tumor) and _get_cnvs(tumor)
Esempio n. 26
0
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet, the MuTect2 command is
    assembled (tool name and some flags depend on whether the runner reports
    ``gatk4``), executed, and the raw calls are passed through
    ``_af_filter``. The final path is handed to
    ``vcfutils.bgzip_and_index`` and that result is returned.

    Raises AssertionError when the runner's GATK major version is below 3.5.
    ``assoc_files`` is currently not used by the active code path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            # The tool is spelled differently depending on the GATK type.
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "-R",
                ref_file, "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC"
            ]
            for a in annotation.get_gatk_annotations(
                    items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # User-supplied extra options from the "mutect2" resources entry.
            resources = config_utils.get_resources("mutect2",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params,
                                            os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                # gatk4: write unfiltered calls with -O, then run the
                # separate filter command on them.
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(
                    tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(
                    tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file,
                                             tx_raw_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                # Other GATK types: redirect stdout into the raw file.
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            # The template is filled from local variable names.
            do.run(cmd.format(**locals()), "MuTect2")
            # In both branches ``tx_raw_file`` is the input to the filter.
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 27
0
def _run_cnvkit_cancer(items, background, access_file, work_dir):
    """Run CNVkit for a tumor/normal pair.

    The tumor BAM is the test sample and the normal BAM is the background;
    the normal sample name is passed on as the background name. The
    ``background`` argument is not used in this function.
    """
    bam_files = [sample["align_bam"] for sample in items]
    paired = vcfutils.get_paired_bams(bam_files, items)
    return _run_cnvkit_shared(
        items[0], [paired.tumor_bam], [paired.normal_bam],
        access_file, work_dir,
        background_name=paired.normal_name)
Esempio n. 28
0
def run(items):
    """Detect structural variants with lumpy on bwa-mem aligned samples.

    All samples are called jointly, then each sample is genotyped, filtered
    by support and (for tumor/normal pairs) filtered against the normal.
    A "lumpy" entry is appended to the ``sv`` list of every item.

    Returns the list of updated items.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", "sentieon-bwa", False, None]
    if any(utils.get_in(sample, ("config", "algorithm", "aligner")) not in allowed_aligners
           for sample in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    paired = vcfutils.get_paired_bams([sample["align_bam"] for sample in items], items)
    # Prefer the tumor sample for locating the work directory.
    if paired and paired.tumor_data:
        workdir_source = paired.tumor_data
    else:
        workdir_source = items[0]
    work_dir = _sv_workdir(workdir_source)

    # Collect per-sample inputs for the joint lumpy run.
    full_bams = []
    sr_bams = []
    disc_bams = []
    previous_evidence = {}
    for sample in items:
        sr_bam, disc_bam = sshared.get_split_discordants(sample, work_dir)
        full_bams.append(dd.get_align_bam(sample))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(sample, work_dir)
        evidence = {}
        for key, bedpe in (("dels", cur_dels), ("dups", cur_dups)):
            if bedpe and utils.file_exists(bedpe):
                evidence[key] = bedpe
        previous_evidence[dd.get_sample_name(sample)] = evidence

    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    # Split out each sample and genotype it according to its tool settings.
    gt_vcfs = {}
    for sample in items:
        name = dd.get_sample_name(sample)
        per_sample_out = utils.append_stem(lumpy_vcf, "-%s" % name)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, name, per_sample_out, sample["config"])
        if "bnd-genotype" in dd.get_tools_on(sample):
            # Genotype everything, breakends included.
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(sample), exclude_file, sample)
        elif "lumpy-genotype" in dd.get_tools_off(sample):
            # Genotyping switched off: keep the selected calls as they are.
            gt_vcf = sample_vcf
        else:
            # Genotype only non-breakend calls, then add breakends back.
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, sample)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(sample), exclude_file, sample)
            combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file=combined_out,
                config=sample["config"])
        gt_vcfs[name] = _filter_by_support(gt_vcf, sample)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    # Attach the final call file to every sample.
    out = []
    for sample in items:
        sample.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(sample)]
        effects_vcf = None
        if dd.get_svprioritize(sample):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, sample, "snpeff")
        sample["sv"].append({"variantcaller": "lumpy",
                             "vrn_file": effects_vcf or vcf_file,
                             "exclude_file": exclude_file})
        out.append(sample)
    return out
Esempio n. 29
0
def _run_cnvkit_cancer(items, background, access_file):
    """Run CNVkit for a tumor/normal pair and post-process with theta.

    The normal BAM serves as the background for the tumor BAM. The CNVkit
    output is passed through ``theta.run`` and then associated with the
    items. The ``background`` argument is not used in this function.
    """
    bam_files = [sample["align_bam"] for sample in items]
    paired = vcfutils.get_paired_bams(bam_files, items)
    work_dir = _sv_workdir(items[0])
    cnvkit_out = _run_cnvkit_shared(
        items[0], [paired.tumor_bam], [paired.normal_bam],
        access_file, work_dir,
        background_name=paired.normal_name)
    return _associate_cnvkit_out(theta.run(cnvkit_out, paired), items)
Esempio n. 30
0
def _run_cnvkit_cancer(items, background, access_file):
    """Call copy number with CNVkit on a tumor sample against its normal.

    Steps: pair the BAMs, run the shared CNVkit routine with the normal as
    background, feed the result to ``theta.run`` and attach the final output
    to the items. The ``background`` argument is not used in this function.
    """
    paired = vcfutils.get_paired_bams([sample["align_bam"] for sample in items], items)
    work_dir = _sv_workdir(items[0])
    tumor_bams = [paired.tumor_bam]
    normal_bams = [paired.normal_bam]
    raw_out = _run_cnvkit_shared(items[0], tumor_bams, normal_bams, access_file,
                                 work_dir, background_name=paired.normal_name)
    theta_out = theta.run(raw_out, paired)
    return _associate_cnvkit_out(theta_out, items)
Esempio n. 31
0
def run(items):
    """Detect structural variants with lumpy on bwa-mem aligned samples.

    All samples are called jointly, then each sample is genotyped with
    svtyper, filtered by support and (for tumor/normal pairs) filtered
    against the normal. A "lumpy" entry is appended to the ``sv`` list of
    every item.

    Returns the list of updated items.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", "sentieon-bwa", False, None]
    if any(utils.get_in(sample, ("config", "algorithm", "aligner")) not in allowed_aligners
           for sample in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    paired = vcfutils.get_paired_bams([sample["align_bam"] for sample in items], items)
    # Prefer the tumor sample for locating the work directory.
    if paired and paired.tumor_data:
        workdir_source = paired.tumor_data
    else:
        workdir_source = items[0]
    work_dir = _sv_workdir(workdir_source)

    # Collect per-sample inputs for the joint lumpy run.
    full_bams = []
    sr_bams = []
    disc_bams = []
    previous_evidence = {}
    for sample in items:
        sr_bam, disc_bam = sshared.get_split_discordants(sample, work_dir)
        full_bams.append(dd.get_align_bam(sample))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(sample, work_dir)
        evidence = {}
        for key, bedpe in (("dels", cur_dels), ("dups", cur_dups)):
            if bedpe and utils.file_exists(bedpe):
                evidence[key] = bedpe
        previous_evidence[dd.get_sample_name(sample)] = evidence

    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    # Split out each sample and genotype it.
    gt_vcfs = {}
    for sample in items:
        name = dd.get_sample_name(sample)
        per_sample_out = utils.append_stem(lumpy_vcf, "-%s" % name)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, name, per_sample_out, sample["config"])
        if "bnd-genotype" in dd.get_tools_on(sample):
            # Genotype everything, breakends included.
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(sample), exclude_file, sample)
        else:
            # Genotype only non-breakend calls, then add breakends back.
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, sample)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(sample), exclude_file, sample)
            combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file=combined_out,
                config=sample["config"])
        gt_vcfs[name] = _filter_by_support(gt_vcf, sample)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    # Attach the final call file to every sample.
    out = []
    for sample in items:
        sample.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(sample)]
        effects_vcf = None
        if dd.get_svprioritize(sample):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, sample, "snpeff")
        sample["sv"].append({"variantcaller": "lumpy",
                             "vrn_file": effects_vcf or vcf_file,
                             "exclude_file": exclude_file})
        out.append(sample)
    return out
Esempio n. 32
0
def run(items):
    """Detect structural variants with lumpy on bwa-mem aligned samples.

    All samples are called jointly from their de-duplicated, split-read and
    discordant BAMs. Each sample is then genotyped with svtyper (breakends
    are kept aside and merged back), filtered by support and, for
    tumor/normal pairs, filtered against the normal. Effects are always
    added with snpeff, and a "lumpy" entry is appended to the ``sv`` list of
    every item.

    Returns the list of updated items.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", False, None]
    if any(utils.get_in(sample, ("config", "algorithm", "aligner")) not in allowed_aligners
           for sample in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    paired = vcfutils.get_paired_bams([sample["align_bam"] for sample in items], items)
    # Prefer the tumor sample for locating the work directory.
    if paired and paired.tumor_data:
        workdir_source = paired.tumor_data
    else:
        workdir_source = items[0]
    work_dir = _sv_workdir(workdir_source)

    # Collect per-sample inputs for the joint lumpy run.
    full_bams = []
    sr_bams = []
    disc_bams = []
    for sample in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(sample, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    # Split out each sample, genotype non-breakend calls, merge breakends back.
    gt_vcfs = {}
    for sample in items:
        name = dd.get_sample_name(sample)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(sample, work_dir)
        per_sample_out = utils.append_stem(lumpy_vcf, "-%s" % name)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, name, per_sample_out, sample["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, sample)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, sample)
        combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file=combined_out,
            ref_file=dd.get_ref_file(sample),
            config=sample["config"])
        gt_vcfs[name] = _filter_by_support(gt_vcf, sample)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    # Attach the final call file to every sample.
    out = []
    for sample in items:
        sample.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(sample)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, sample, "snpeff")
        sample["sv"].append({"variantcaller": "lumpy",
                             "vrn_file": effects_vcf or vcf_file,
                             "exclude_file": exclude_file})
        out.append(sample)
    return out
Esempio n. 33
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet and a normal BAM is
    available, VarDict is run on the tumor/normal pair and piped through the
    somatic strand-bias and VCF conversion scripts. Without a normal BAM the
    call is delegated to ``_run_vardict_caller`` and its result is returned
    directly.

    Returns the value returned by ``annotation.annotate_nongatk_vcf`` (or by
    ``_run_vardict_caller`` in the tumor-only case).
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal sample: fall back to the non-paired caller. This
                # returns from inside the transaction block.
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            # NOTE(review): ``vcffilter`` and ``vcfallelicprimitives`` are
            # looked up here but are not referenced by the command template
            # below.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program(
                "vcfallelicprimitives", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_somatic.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # Configured value is divided by 100; the default of 10 gives 0.1.
            freq = float(
                utils.get_in(config,
                             ("algorithm", "min_allele_fraction"), 10)) / 100.0
            opts = " ".join(
                _vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(
                config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "{vardict} -G {ref_file} -f {freq} "
                "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                "| {strandbias} "
                "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            # Both BAMs are indexed before the command is run.
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            # The template is filled from local variable names, so renaming
            # any local used above changes the generated command.
            do.run(cmd.format(**locals()),
                   "Genotyping with VarDict: Inference", {})
    # dbSNP is optional here: a missing key is passed on as None.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Esempio n. 34
0
def _ready_for_het_analysis(items):
    """Report whether the inputs can be used for heterogeneity analysis.

    At least one item must have a heterogeneity caller configured and the
    items must form a pair. In that case the result of combining the tumor's
    variant lookup and CNV-only call lookup with ``and`` is returned;
    otherwise None is returned.
    """
    align_bams = [dd.get_align_bam(sample) for sample in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not any(dd.get_hetcaller(sample) for sample in items):
        return None
    if not paired:
        return None
    tumor = paired.tumor_data
    return get_variants(tumor) and _get_calls(tumor, cnv_only=True)
Esempio n. 35
0
def _run_cnvkit_cancer(items, background):
    """Run CNVkit for a tumor sample against its non-tumor samples.

    Every item whose sample name differs from the tumor name is used as
    normal data. When CNVkit yields no output the items are returned
    untouched; otherwise the single output is associated with the tumor
    sample as a somatic call and the tumor plus normal data are returned.
    The ``background`` argument is not used in this function.
    """
    paired = vcfutils.get_paired_bams([sample["align_bam"] for sample in items], items)
    normal_data = []
    for sample in items:
        if dd.get_sample_name(sample) != paired.tumor_name:
            normal_data.append(sample)
    ckouts = _run_cnvkit_shared([paired.tumor_data], normal_data)
    if ckouts:
        assert len(ckouts) == 1
        tumor_data = _associate_cnvkit_out(ckouts, [paired.tumor_data], is_somatic=True)
        return tumor_data + normal_data
    return items
Esempio n. 36
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    When ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. If the output does not exist yet and a normal BAM is
    available, FreeBayes is run on the pair and piped through ``vcffilter``,
    ``vcfallelicprimitives``, ``vcfstreamsort`` and ``vcfsamplediff`` (plus
    ``bgzip`` when the output name ends in ``gz``). Without a normal BAM the
    call is delegated to ``_run_freebayes_caller`` and its result is
    returned directly. Otherwise ``fix_somatic_calls`` is applied and the
    output is passed to ``annotation.annotate_nongatk_vcf``, whose return
    value is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal sample: fall back to the non-paired caller. This
                # returns from inside the transaction block.
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
                #raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            # Program paths; the local names are referenced by the command
            # template below through ``locals()``.
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Only add a default allele fraction when the configured options
            # do not already carry one (substring check on the option text).
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                  "min_allele_fraction"), 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            # https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
            opts += " --pooled-discrete --genotype-qualities --report-genotype-likelihood-max"
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 5' -s "
                  "| {vcfallelicprimitives} | {vcfstreamsort} "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} > {tx_out_file}")
            # Both BAMs are indexed before the command is run.
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            # The template is filled from local variable names, so renaming
            # any local used above changes the generated command.
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    # Post-processing runs whether or not the calls were just generated.
    fix_somatic_calls(out_file, config)
    # dbSNP is optional here: a missing key is passed on as None.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Esempio n. 37
0
def _run_cnvkit_cancer(items, background):
    """Call copy number with CNVkit on a tumor sample and its normals.

    Items other than the tumor sample (matched by sample name) act as normal
    data. If the shared CNVkit routine produces nothing, the original items
    are returned. Otherwise exactly one output is expected; it is attached
    to the tumor sample as a somatic call and the tumor and normal data are
    returned together. The ``background`` argument is not used here.
    """
    bam_files = [sample["align_bam"] for sample in items]
    paired = vcfutils.get_paired_bams(bam_files, items)
    normal_data = [sample for sample in items
                   if dd.get_sample_name(sample) != paired.tumor_name]
    ckouts = _run_cnvkit_shared([paired.tumor_data], normal_data)
    if ckouts:
        assert len(ckouts) == 1
        updated_tumor = _associate_cnvkit_out(ckouts, [paired.tumor_data], is_somatic=True)
        return updated_tumor + normal_data
    return items
Esempio n. 38
0
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill the paired-sample template for tumor/normal CNV assessment.

    The tumor sample is the case and the normal sample is the control. The
    core count is fixed at 0. The ``names`` argument is not used in this
    function.

    Returns the formatted ``_paired_prep`` template string.
    """
    paired = vcfutils.get_paired_bams(work_bams, items)
    template_fields = {
        "case_file": paired.tumor_bam,
        "case_name": paired.tumor_name,
        "ctrl_file": paired.normal_bam,
        "ctrl_name": paired.normal_name,
        "num_cores": 0,
        "chrom": chrom,
        "pairmode": pairmode,
    }
    return _paired_prep.format(**template_fields)
Esempio n. 39
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call variants with octopus in somatic or germline mode.

    An existing ``out_file`` is returned as is. Otherwise somatic calling is
    used when the inputs form a pair and germline calling when they do not.
    The ``assoc_files`` argument is not used in this function.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = vcfutils.get_paired_bams(align_bams, items)
    regions = _get_regions(region, out_file, items)
    if paired:
        return _run_somatic(paired, ref_file, regions, out_file)
    return _run_germline(align_bams, items, ref_file, regions, out_file)
Esempio n. 40
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with strelka2 in somatic or germline mode.

    Paired analyses go through somatic calling and require a normal BAM;
    everything else goes through germline calling.

    Returns the call file produced by the selected mode.

    Raises AssertionError when a paired analysis has no normal BAM.
    """
    if not vcfutils.is_paired_analysis(align_bams, items):
        return _run_germline(align_bams, items, ref_file, assoc_files,
                             region, out_file)
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired.normal_bam, "Strelka2 requires a normal sample"
    return _run_somatic(paired, ref_file, assoc_files, region, out_file)
Esempio n. 41
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples. Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    When the pairing has no normal BAM, calling is delegated to
    ``_run_freebayes_caller`` and its result is returned directly.

    :param align_bams: (list) BAM files; the first one names the default output
    :param items: (list) sample data; configuration is read from ``items[0]``
    :param ref_file: (str) reference passed to FreeBayes ``-f`` and ``vt normalize -r``
    :param assoc_files: mapping whose ``dbsnp`` entry is handed to annotation
    :param region: region restriction forwarded to the option builder
    :param out_file: (str) output VCF; derived from ``align_bams[0]`` when None
    :returns: result of ``annotation.annotate_nongatk_vcf`` for the output file
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    # Skip calling entirely when the output already exists; annotation still runs.
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            # Tumor-only input: fall back to the non-paired caller.
            # NOTE(review): this return happens inside the open transaction.
            if not paired.normal_bam:
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
                #raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Substring test against the joined option string, so any option
            # text containing "-F" also suppresses the default below.
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                # The configured value is a percentage; convert to a fraction.
                min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                  "min_allele_fraction"), 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            opts += (" --pooled-discrete --pooled-continuous --genotype-qualities "
                     "--report-genotype-likelihood-max --allele-balance-priors-off")
            # Only compress when the requested output name ends in "gz".
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # "py" executable expected next to the running Python interpreter.
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            # The placeholders below are resolved from local variable names via
            # format(**locals()); renaming a local requires updating the template.
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| vcffilter -f 'QUAL > 5' -s "
                  "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                  "| {fix_ambig} | vcfallelicprimitives --keep-info --keep-geno "
                  "| vt normalize -q -r {ref_file} - "
                  "{compress_cmd} > {tx_out_file}")
            # Index both BAMs before launching the pipeline.
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Esempio n. 42
0
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill in the CNV loading template for a tumor/normal pair.

    When a regional BED file is available for the first item, the targeted
    template is used and receives that BED file; otherwise the plain paired
    template is used. ``names`` is accepted but unused.
    """
    pair = vcfutils.get_paired_bams(work_bams, items)
    bed_file = _get_regional_bed_file(items[0])
    template_args = dict(case_file=pair.tumor_bam, case_name=pair.tumor_name,
                         ctrl_file=pair.normal_bam, ctrl_name=pair.normal_name,
                         num_cores=0, chrom=chrom, pairmode=pairmode)
    if not bed_file:
        return _paired_prep.format(**template_args)
    return _paired_prep_targeted.format(bed_file=bed_file, **template_args)
Esempio n. 43
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call variants with strelka2 for a tumor/normal pair or germline samples.

    region can be a single region or list of multiple regions for multicore calling.
    Paired analyses must include a normal sample.
    """
    if not vcfutils.is_paired_analysis(align_bams, items):
        return _run_germline(align_bams, items, ref_file,
                             assoc_files, region, out_file)
    pair = vcfutils.get_paired_bams(align_bams, items)
    assert pair.normal_bam, "Strelka2 requires a normal sample"
    return _run_somatic(pair, ref_file, assoc_files, region, out_file)
Esempio n. 44
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call variants with octopus in somatic or germline mode.

    An already existing ``out_file` ` is returned as is. Otherwise the
    population variant regions are subset (with merging) to the requested
    region and passed to the somatic runner for paired inputs, or to the
    germline runner for everything else.
    """
    if utils.file_exists(out_file):
        return out_file
    pair = vcfutils.get_paired_bams(align_bams, items)
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items=items, do_merge=True)
    if not pair:
        return _run_germline(align_bams, items, ref_file, target, out_file)
    return _run_somatic(pair, ref_file, target, out_file)
Esempio n. 45
0
def _run_cnvkit_cancer(items, background):
    """Run CNVkit for the tumor sample of a tumor/normal pair.

    The normal BAM serves as the background. If CNVkit yields no output the
    input items are returned unchanged; otherwise the tumor sample, with its
    CNVkit output attached, is returned followed by every non-tumor item.
    ``background`` is accepted but unused.
    """
    align_bams = [data["align_bam"] for data in items]
    pair = vcfutils.get_paired_bams(align_bams, items)
    work_dir = _sv_workdir(pair.tumor_data)
    ckouts = _run_cnvkit_shared([pair.tumor_data], [pair.tumor_bam],
                                [pair.normal_bam], work_dir,
                                background_name=pair.normal_name)
    if not ckouts:
        return items
    assert len(ckouts) == 1
    with_cnv = _associate_cnvkit_out(ckouts, [pair.tumor_data])
    others = []
    for data in items:
        if dd.get_sample_name(data) != pair.tumor_name:
            others.append(data)
    return with_cnv + others
Esempio n. 46
0
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill in the CNV loading template for a tumor/normal pair.

    When a regional BED file is available for the first item, the targeted
    template is used and receives that BED file; otherwise the plain paired
    template is used. ``names`` is accepted but unused.
    """
    pair = vcfutils.get_paired_bams(work_bams, items)
    bed_file = _get_regional_bed_file(items[0])
    template_args = dict(case_file=pair.tumor_bam, case_name=pair.tumor_name,
                         ctrl_file=pair.normal_bam, ctrl_name=pair.normal_name,
                         num_cores=0, chrom=chrom, pairmode=pairmode)
    if not bed_file:
        return _paired_prep.format(**template_args)
    return _paired_prep_targeted.format(bed_file=bed_file, **template_args)
Esempio n. 47
0
def _run_cnvkit_cancer(items, background):
    """Run CNVkit for the tumor sample of a tumor/normal pair.

    An access file is created from the tumor sample's reference, then CNVkit
    runs with the normal BAM as background. The tumor sample, with its CNVkit
    output attached, is returned followed by every non-tumor item.
    ``background`` is accepted but unused.
    """
    align_bams = [data["align_bam"] for data in items]
    pair = vcfutils.get_paired_bams(align_bams, items)
    tumor = pair.tumor_data
    work_dir = _sv_workdir(tumor)
    access_file = _create_access_file(dd.get_ref_file(tumor), work_dir, tumor)
    ckout = _run_cnvkit_shared(tumor, [pair.tumor_bam], [pair.normal_bam],
                               access_file, work_dir,
                               background_name=pair.normal_name)
    # THetA (theta.run on the CNVkit output) is intentionally not run here
    # until its data preparation steps can be sped up.
    with_cnv = _associate_cnvkit_out(ckout, [tumor])
    others = []
    for data in items:
        if dd.get_sample_name(data) != pair.tumor_name:
            others.append(data)
    return with_cnv + others
Esempio n. 48
0
def _run_cnvkit_cancer(items, background):
    """Run CNVkit for the tumor sample of a tumor/normal pair.

    An access file is created from the tumor sample's reference, then CNVkit
    runs with the normal BAM as background. If CNVkit yields no output the
    input items are returned unchanged; otherwise the tumor sample, with its
    CNVkit output attached, is returned followed by every non-tumor item.
    ``background`` is accepted but unused.
    """
    align_bams = [data["align_bam"] for data in items]
    pair = vcfutils.get_paired_bams(align_bams, items)
    tumor = pair.tumor_data
    work_dir = _sv_workdir(tumor)
    access_file = _create_access_file(dd.get_ref_file(tumor), work_dir, tumor)
    ckout = _run_cnvkit_shared(tumor, [pair.tumor_bam], [pair.normal_bam],
                               access_file, work_dir,
                               background_name=pair.normal_name)
    if not ckout:
        return items

    with_cnv = _associate_cnvkit_out(ckout, [tumor])
    others = []
    for data in items:
        if dd.get_sample_name(data) != pair.tumor_name:
            others.append(data)
    return with_cnv + others
Esempio n. 49
0
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill in the CNV loading template for a tumor/normal pair.

    The targeted template is chosen when the first item's ``variant_regions``
    file exists and its ``coverage_interval`` (default "exome") is not
    "genome"; otherwise the plain paired template is used. ``names`` is
    accepted but unused.
    """
    pair = vcfutils.get_paired_bams(work_bams, items)
    algorithm = items[0]["config"]["algorithm"]
    bed_file = algorithm.get("variant_regions", None)
    is_genome = algorithm.get("coverage_interval", "exome").lower() == "genome"
    template_args = dict(case_file=pair.tumor_bam, case_name=pair.tumor_name,
                         ctrl_file=pair.normal_bam, ctrl_name=pair.normal_name,
                         num_cores=0, chrom=chrom, pairmode=pairmode)
    use_targeted = utils.file_exists(bed_file) and not is_genome
    if use_targeted:
        return _paired_prep_targeted.format(bed_file=bed_file, **template_args)
    return _paired_prep.format(**template_args)
Esempio n. 50
0
def run_varscan(align_bams, items, ref_file, assoc_files,
                region=None, out_file=None):
    """Run VarScan through the shared samtools variant calling driver.

    The paired worker is used when both a tumor and a normal BAM are found.
    Otherwise the inputs are checked for pairing problems and the standard
    worker is used.
    """
    pair = get_paired_bams(align_bams, items)
    if pair and pair.normal_bam and pair.tumor_bam:
        worker = _varscan_paired
    else:
        vcfutils.check_paired_problems(items)
        worker = _varscan_work
    return samtools.shared_variantcall(worker, "varscan", align_bams,
                                       ref_file, items, assoc_files,
                                       region, out_file)
Esempio n. 51
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.

    The command line is adapted to the GATK flavour reported by the runner:
    for "gatk4" the raw calls are written with ``-O`` and then passed through
    a separate filter command; otherwise the caller's standard output is
    redirected to the raw file. The raw file is then passed through
    ``_af_filter`` and the result is bgzipped and indexed.

    :param align_bams: (list) BAM files; the first one names the default output
    :param items: (list) sample data; configuration is read from ``items[0]``
    :param ref_file: (str) reference genome passed to GATK
    :param assoc_files: associated resource files (currently not added to the
        command; see the commented-out call below)
    :param region: region restriction forwarded to ``_add_region_params``
    :param out_file: (str) output VCF; derived from ``align_bams[0]`` when None
    :returns: result of ``vcfutils.bgzip_and_index`` on the output file
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    # Skip calling when the output already exists; indexing below still runs.
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            # The tool name is spelled "Mutect2" for gatk4 and "MuTect2" otherwise.
            params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            # The reference flag also differs between the two flavours.
            if gatk_type == "gatk4":
                params += ["--reference", ref_file]
            else:
                params += ["-R", ref_file]
            for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            # User-supplied extra options from the "mutect2" resources section.
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            # NOTE(review): this version check is an assert, so it is skipped
            # when Python runs with optimizations enabled.
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            # Both branches leave the calls to post-process in tx_raw_file.
            # The command templates are filled from local variable names via
            # format(**locals()); renaming a local requires updating them.
            if gatk_type == "gatk4":
                # gatk4: write unfiltered calls, then run the filter command
                # from the unfiltered file to tx_raw_file.
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                # Older GATK: capture the caller's standard output directly.
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            # _af_filter is given the final out_file (not tx_out_file) and its
            # return value replaces out_file.
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Esempio n. 52
0
def estimate(items, batch, config):
    """Estimate heterogeneity for a pair of tumor/normal samples. Run in parallel.

    XXX In progress, currently uses THetA but not yet turned on

    THetA is run on the first CNV entry of the tumor sample and its result is
    only printed. The return value is each item whose first batch equals
    ``batch``, wrapped in its own single-element list. ``config`` is unused.
    """
    align_bams = [dd.get_align_bam(data) for data in items]
    pair = vcfutils.get_paired_bams(align_bams, items)
    tumor_cnvs = _get_cnvs(pair.tumor_data)
    theta_out = theta.run(tumor_cnvs[0], pair)
    print(theta_out)

    return [[data] for data in items if batch == _get_batches(data)[0]]
Esempio n. 53
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call variants with strelka2, then annotate and filter into ``out_file``.

    region can be a single region or list of multiple regions for multicore calling.

    Raw calls go to a "-raw.vcf.gz" file and strelka2 works in a "-work"
    directory, both named after ``out_file``. Paired inputs must include a
    normal sample.
    """
    out_base = utils.splitext_plus(out_file)[0]
    raw_file = "%s-raw.vcf.gz" % out_base
    work_dir = "%s-work" % out_base
    pair = vcfutils.get_paired_bams(align_bams, items)
    if not pair:
        called = _run_germline(align_bams, items, ref_file,
                               assoc_files, region, raw_file, work_dir)
    else:
        assert pair.normal_bam, "Strelka2 requires a normal sample"
        called = _run_somatic(pair, ref_file, assoc_files, region, raw_file, work_dir)
    return _af_annotate_and_filter(pair, items, called, out_file)
Esempio n. 54
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call variants with strelka2, then annotate and filter into ``out_file``.

    region can be a single region or list of multiple regions for multicore calling.

    Raw calls go to a "-raw.vcf.gz" file and strelka2 works in a "-work"
    directory, both named after ``out_file``. Paired inputs must include a
    normal sample.
    """
    out_base = utils.splitext_plus(out_file)[0]
    raw_file = "%s-raw.vcf.gz" % out_base
    work_dir = "%s-work" % out_base
    pair = vcfutils.get_paired_bams(align_bams, items)
    if not pair:
        called = _run_germline(align_bams, items, ref_file,
                               assoc_files, region, raw_file, work_dir)
    else:
        assert pair.normal_bam, "Strelka2 requires a normal sample"
        called = _run_somatic(pair, ref_file, assoc_files, region, raw_file, work_dir)
    return _af_annotate_and_filter(pair, items, called, out_file)
Esempio n. 55
0
def _run_tumor_pindel_caller(align_bams,
                             items,
                             ref_file,
                             assoc_files,
                             region=None,
                             out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.
    Only attempts to detect small insertion/deletions and not larger structural events.
    :param align_bams: (list) bam files
    :param items: (list) sample information from yaml; configuration is read
        from the first entry
    :param ref_file: (str) genome in fasta format
    :param assoc_files: (dict) files for annotation; the ``dbsnp`` entry is used
    :param region: (str or tuple) region to analyze
    :param out_file: (str) final vcf file; derived from ``align_bams[0]`` when None
    :returns: (str) final vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    # The tumor is always an input; the normal is added only when present.
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam.append(paired.normal_bam)
        paired_name.append(paired.normal_name)
    # Skip calling when the output already exists; annotation still runs.
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            # pindel writes its outputs using this path as a prefix (-o).
            root_pindel = os.path.join(tmp_path, "pindelroot")
            pindel = config_utils.get_program("pindel", config)
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path,
                                          config)
            # Placeholders are resolved from local variable names via
            # format(**locals()); renaming a local requires updating the
            # template. All larger event types are switched off.
            cmd = (
                "{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} " +
                "{opts} --report_inversions false --report_duplications false "
                "--report_long_insertions false --report_breakpoints false "
                "--report_interchromosomal_events false "
                "--max_range_index 2")
            do.run(cmd.format(**locals()), "Genotyping with pindel", {})
            # Convert the pindel outputs under root_pindel into the VCF.
            out_file = _create_vcf(root_pindel, out_file, ref_file, items,
                                   paired)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Esempio n. 56
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.

    When the pairing has no normal BAM, calling is delegated to
    ``_run_vardict_caller`` and its result is returned directly.

    :param align_bams: (list) BAM files; the first one names the default output
    :param items: (list) sample data; configuration is read from ``items[0]``
    :param ref_file: (str) reference genome passed to VarDict ``-G``
    :param assoc_files: mapping whose ``dbsnp`` entry is handed to annotation
    :param region: region restriction forwarded to the option builder
    :param out_file: (str) output VCF; derived from ``align_bams[0]`` when None
    :returns: result of ``annotation.annotate_nongatk_vcf`` for the output file
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    # Skip calling when the output already exists; annotation still runs.
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            # Tumor-only input: fall back to the non-paired caller.
            # NOTE(review): this return happens inside the open transaction.
            if not paired.normal_bam:
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            # NOTE(review): vcffilter is looked up here but the command
            # template below does not reference it.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            # Script names used as pipeline stages in the command below.
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            # Only compress when the requested output name ends in "gz".
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # The configured value is a percentage (default 10); convert to a fraction.
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region, do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The somatic filter stage is dropped when any sample lists
            # "vardict_somatic_filter" under tools_off.
            if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items):
                somatic_filter = ""
            else:
                # Uses the "py" executable next to the running Python interpreter.
                somatic_filter = ("| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                  os.path.join(os.path.dirname(sys.executable), "py"))
            # Placeholders are resolved from local variable names via
            # format(**locals()); renaming a local requires updating the template.
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -M -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            # Index both BAMs before launching the pipeline.
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Esempio n. 57
0
def _run_cnvkit_cancer(items, background):
    """Run CNVkit for the tumor sample of a tumor/normal pair.

    The normal BAM serves as the background. If CNVkit yields no output the
    input items are returned unchanged; otherwise the tumor sample, with its
    CNVkit output attached, is returned followed by every non-tumor item.
    ``background`` is accepted but unused.
    """
    align_bams = [data["align_bam"] for data in items]
    pair = vcfutils.get_paired_bams(align_bams, items)
    work_dir = _sv_workdir(pair.tumor_data)
    ckouts = _run_cnvkit_shared([pair.tumor_data], [pair.tumor_bam],
                                [pair.normal_bam], work_dir,
                                background_name=pair.normal_name)
    if not ckouts:
        return items
    assert len(ckouts) == 1
    with_cnv = _associate_cnvkit_out(ckouts, [pair.tumor_data])
    others = []
    for data in items:
        if dd.get_sample_name(data) != pair.tumor_name:
            others.append(data)
    return with_cnv + others
Esempio n. 58
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only smCounter2 calling.

    The inputs must form a pairing without a normal BAM. smCounter2 runs
    inside the transaction directory; its "*.smCounter*" outputs are then
    moved beside ``out_file`` and the ".smCounter.cut.vcf" result is linked to
    ``out_file``. Finally the VCF is bgzipped and indexed, with a prep command
    that rewrites the sample column name and adds contigs to the header.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "smCounter2 supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    tumor = paired.tumor_data
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items=items, do_merge=True)
    # Work on the uncompressed name; compression happens at the end.
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    final_dir = os.path.dirname(out_file)
    already_called = (utils.file_exists(out_file)
                      or utils.file_exists(out_file + ".gz"))
    if not already_called:
        with file_transaction(tumor, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["smCounter2",
                   "--runPath", tx_dir,
                   "--outPrefix", out_prefix,
                   "--bedTarget", target,
                   "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam,
                   "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(tumor)]
            do.run(cmd, "smcounter2 variant calling")
            # Relocate every smCounter output next to the final file.
            for produced in glob.glob(os.path.join(tx_dir, "*.smCounter*")):
                destination = os.path.join(final_dir,
                                           os.path.basename(produced))
                shutil.move(produced, destination)
            cut_vcf = os.path.join(final_dir,
                                   "%s.smCounter.cut.vcf" % out_prefix)
            utils.symlink_plus(cut_vcf, out_file)
    sample_name = dd.get_sample_name(tumor)
    contig_cmd = vcfutils.add_contig_to_header_cl(dd.get_ref_file(tumor),
                                                  out_file)
    prep_cmd = "sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" % (out_prefix,
                                                        sample_name,
                                                        contig_cmd)
    return vcfutils.bgzip_and_index(out_file, tumor["config"],
                                    remove_orig=False, prep_cmd=prep_cmd)