def split_somatic(items):
    """Split somatic batches, adding stand-alone germline targets.

    Normals from tumor/normal pairs whose variantcaller is a dictionary with a
    "germline" entry are copied into separate "<description>-germline" samples,
    which enables germline calling on the shared alignments. Somatic samples
    with a dictionary variantcaller are restricted to its "somatic" entry.

    Returns non-somatic samples, then somatic samples, then the new germline
    copies.
    """
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)

    # Build germline-only copies of normals found in tumor/normal pairs
    seen_germline = set()
    germline_targets = []
    for group in somatic_groups:
        pair = vcfutils.get_paired(group)
        if not (pair and pair.normal_data):
            continue
        normal = utils.deepish_copy(pair.normal_data)
        callers = dd.get_variantcaller(normal)
        if not (isinstance(callers, dict) and "germline" in callers):
            continue
        normal["description"] = "%s-germline" % normal["description"]
        # A normal shared between several batches is only added once
        if normal["description"] in seen_germline:
            continue
        seen_germline.add(normal["description"])
        normal["rgnames"]["sample"] = normal["description"]
        del normal["metadata"]["batch"]
        normal["metadata"]["phenotype"] = "germline"
        normal = remove_align_qc_tools(normal)
        normal["config"]["algorithm"]["variantcaller"] = callers["germline"]
        germline_targets.append(normal)

    # Somatic targets only keep the somatic part of the caller specification
    somatic_targets = []
    for data in somatic:
        callers = dd.get_variantcaller(data)
        if isinstance(callers, dict) and "somatic" in callers:
            data["config"]["algorithm"]["variantcaller"] = callers["somatic"]
        somatic_targets.append(data)
    return non_somatic + somatic_targets + germline_targets
def split_somatic(items):
    """Split somatic batches, adding germline targets for paired normals.

    Each input first passes through _clean_flat_variantcaller. Normals from
    tumor/normal pairs that define a "germline" caller are then copied into
    their own "<description>-germline" batch with a "germline" phenotype, and
    somatic samples are restricted to their "somatic" callers.

    Returns non-somatic samples, then somatic samples, then the germline
    copies.
    """
    def _has_entry(callers, key):
        return isinstance(callers, dict) and key in callers

    items = [_clean_flat_variantcaller(x) for x in items]
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)

    # Pull germline samples to run out of the normals in tumor/normal pairs
    added_names = set()
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if not (paired and paired.normal_data):
            continue
        normal = utils.deepish_copy(paired.normal_data)
        callers = dd.get_variantcaller(normal)
        name = normal["description"]
        # Only add each normal once, even if it is shared across batches
        if not _has_entry(callers, "germline") or name in added_names:
            continue
        added_names.add(name)
        normal["rgnames"]["sample"] = name
        normal["metadata"]["batch"] = "%s-germline" % name
        normal["metadata"]["phenotype"] = "germline"
        normal = remove_align_qc_tools(normal)
        normal["config"]["algorithm"]["variantcaller"] = callers["germline"]
        germline.append(normal)

    # Fix the variant calling specification for somatic-only targets
    somatic_out = []
    for data in somatic:
        callers = dd.get_variantcaller(data)
        if _has_entry(callers, "somatic"):
            data["config"]["algorithm"]["variantcaller"] = callers["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
def run_jointvc(items):
    """Run joint calling for one region across a group of samples.

    The first item supplies configuration, region and output naming; the
    `vrn_file` of every item is passed to `square_batch_region`. If no
    jointcaller is configured, "<variantcaller>-joint" is set on the first
    item.

    Returns the first item with `vrn_file_region` pointing at the joint call.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)

    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")

    # Fall back to the sample name when the sample is not part of a batch
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]

    region_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "joint", dd.get_variantcaller(data), str_region))
    file_name = "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region)
    out_file = os.path.join(region_dir, file_name)

    input_vcfs = [d["vrn_file"] for d in items]
    joint_out = square_batch_region(data, ready_region, [], input_vcfs, out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct call file into `<work_dir>/variants/<calls|gvcf>`,
    bgzips and indexes it, and records the copied files overall and per
    sample.

    Returns a one-element list holding a dictionary with "validate" and
    "variants" ("calls", "gvcf", "samples") entries.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]

    variants = {"calls": [], "gvcf": [], "samples": []}
    out = {"validate": validate.combine_validations(items), "variants": variants}
    copied = set()
    files_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        # Remember the first-seen order of samples for the per-sample output
        for sample in batch_samples:
            if sample not in sample_order:
                sample_order.append(sample)
        if not data.get("vrn_file"):
            continue
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for sample in batch_samples:
                files_by_sample[sample].append(out_file)
            if cur_name in copied:
                continue
            copied.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            variants[out_key].append(out_file)
    for sample in sample_order:
        variants["samples"].append(files_by_sample[sample])
    return [out]
def run_jointvc(items):
    """Run joint calling for one region across a group of samples.

    The first item supplies configuration, region and output naming; the
    `vrn_file` of every item is passed to `square_batch_region`. If no
    jointcaller is configured, "<variantcaller>-joint" is set on the first
    item.

    The output file is named after the first batch of the sample, or after
    the sample itself when it does not belong to a batch.

    Returns the first item with `vrn_file_region` pointing at the joint call.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    # Samples without a batch would otherwise fail on indexing an empty
    # result; name the output after the sample in that case.
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                              dd.get_variantcaller(data), str_region))
    out_file = os.path.join(out_dir, "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data),
                                                          str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    Inputs are grouped by (batch samples, phenotype). For each group with more
    than one entry, a deep copy of the first entry is returned with added
    "batch_id", "batch_samples" and "variants" (callers and call files) keys.
    Results are ordered by the earliest input position of their samples.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, _phenotype), gsamples in batch_groups.items():
        if len(gsamples) <= 1:
            continue
        batches = set()
        for d in gsamples:
            # Samples without a batch contribute nothing rather than failing;
            # the joined sample names are used as the identifier below.
            batches.update(dd.get_batches(d) or [])
        cur = copy.deepcopy(gsamples[0])
        cur.update({"batch_id": sorted(batches)[0] if batches else "_".join(batch_samples),
                    "batch_samples": batch_samples,
                    "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                 "calls": [d.get("vrn_file") for d in gsamples]}})
        out.append(cur)

    # First input position of every sample name, for ordering the output
    first_position = {}
    for i, name in enumerate(sample_order):
        first_position.setdefault(name, i)

    def by_original_order(d):
        return min([first_position[s] for s in d["batch_samples"] if s in first_position])
    return sorted(out, key=by_original_order)
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns a dictionary of booleans: "vc" when any sample has a variant
    caller and "hla" when any sample has an HLA caller.
    """
    def _any_configured(getter):
        return any([getter(d) for d in samples])

    return {"vc": _any_configured(dd.get_variantcaller),
            "hla": _any_configured(dd.get_hlacaller)}
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does
    not handle pooled calling, only tumor/normal.

    Works on deep copies of `items` with `config` merged in, so the inputs are
    not modified.

    Raises ValueError for a multi-sample batch without a tumor/normal pair that
    uses VarDict or MuTect.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        for batch in dd.get_batches(data) or []:
            data_by_batches[batch].append(data)
    for batch, batch_items in data_by_batches.items():
        if vcfutils.get_paired(batch_items):
            vcfutils.check_paired_problems(batch_items)
        elif len(batch_items) > 1:
            vcs = set()
            for data in batch_items:
                vc = dd.get_variantcaller(data) or []
                # A single caller given as a bare string must be treated as one
                # name; iterating it would yield individual characters and the
                # checks below would never match.
                # NOTE(review): dictionary (somatic/germline) specifications
                # still contribute only their keys, as before -- confirm
                # whether their values should be checked as well.
                if not isinstance(vc, (list, tuple, dict)):
                    vc = [vc]
                vcs.update(vc)
            sample_names = [dd.get_sample_name(data) for data in batch_items]
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, sample_names))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, sample_names))
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data.

    Works on a copy of `data`: turns on the "gvcf" tool, sets the jointcaller
    to "<caller>-joint" for each configured variant caller, calls each split
    region separately with the GATK haplotype caller and concatenates the
    per-region files.

    Returns the copied data with `vrn_file` set to the combined output.
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    # The variant caller may be a bare string rather than a list; wrap it so
    # the joint caller name is not built per character.
    callers = dd.get_variantcaller(data)
    if not isinstance(callers, (list, tuple)):
        callers = [callers] if callers else []
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in callers])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        # Directory for per-region outputs; created once rather than per region
        region_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                                                     "gatk-haplotype", "regions"))
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(region_dir, "%s-%s-gatk-haplotype.vcf.gz"
                                       % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                                dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def _needs_java(data):
    """Check if a caller needs external java for MuTect.

    No longer check for older GATK (<3.6) versions because of time cost; this
    won't be relevant to most runs so we skip the sanity check.

    The variant caller may be a single name, a list of names or a dictionary
    with a "somatic" entry; for a dictionary only the somatic callers are
    inspected.

    Returns True when "mutect" is one of the callers, otherwise False.
    """
    vc = dd.get_variantcaller(data)
    # Pick the callers to inspect first, so a list is never indexed by
    # "somatic"/"germline" as if it were a dictionary.
    if isinstance(vc, dict):
        callers = vc.get("somatic", [])
    else:
        callers = vc
    if not isinstance(callers, (list, tuple)):
        callers = [callers]
    return "mutect" in callers
def _needs_java(data):
    """Check if a caller needs external java for MuTect.

    No longer check for older GATK (<3.6) versions because of time cost; this
    won't be relevant to most runs so we skip the sanity check.

    Accepts a variant caller given as a single name, a list of names or a
    dictionary with a "somatic" entry (only the somatic callers are checked).

    Returns True when "mutect" is one of the callers, otherwise False.
    """
    vc = dd.get_variantcaller(data)
    # Resolve dictionary specifications up front; this avoids indexing a plain
    # list with "somatic"/"germline" keys.
    callers = vc.get("somatic", []) if isinstance(vc, dict) else vc
    if not isinstance(callers, (list, tuple)):
        callers = [callers]
    return "mutect" in callers
def rnaseq_variant_calling(samples, run_parallel):
    """Run RNA-seq variant calling, joint squaring off and annotation/filtering.

    The variant caller of the first sample decides the path taken: with
    "gatk-haplotype" the calls are squared off and split back per sample
    before filtering, and afterwards the per-sample file (`vrn_file_orig`)
    becomes `vrn_file` while the previous `vrn_file` is recorded as the
    population VCF under `variants`.
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    uses_gatk = variantcaller and ("gatk-haplotype" in variantcaller)

    if uses_gatk:
        per_sample = []
        for squared in joint.square_off(samples, run_parallel):
            split = multi.split_variants_by_sample(to_single_data(squared))
            per_sample.extend([[to_single_data(xs)] for xs in split])
        samples = per_sample

    if variantcaller:
        samples = run_parallel("run_rnaseq_ann_filter", samples)

    if uses_gatk:
        finalized = []
        for data in (to_single_data(xs) for xs in samples):
            data.setdefault("variants", []).append(
                {"variantcaller": "gatk-haplotype",
                 "vcf": data["vrn_file_orig"],
                 "population": {"vcf": data["vrn_file"]}})
            data["vrn_file"] = data.pop("vrn_file_orig")
            finalized.append([data])
        samples = finalized
    return samples
def _rnaseq_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    "rnaseq" is always on; "vc" is on when any sample has a variant caller.
    """
    has_caller = any([dd.get_variantcaller(d) for d in samples])
    return {"rnaseq": True, "vc": has_caller}
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct "<batch or sample>-<caller>" call file into
    `<work_dir>/variants/calls`, then bgzips and indexes it.

    Returns a one-element list with the "validate" entry of the first item
    and the list of copied call files.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    calls = []
    out = {"validate": items[0]["validate"], "variants": {"calls": calls}}
    seen = set()
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        cur_name = "%s-%s" % (names[0], dd.get_variantcaller(data))
        if cur_name in seen:
            continue
        call_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", "calls"))
        out_file = os.path.join(call_dir, "%s.vcf.gz" % cur_name)
        seen.add(cur_name)
        # Ideally could symlink here but doesn't appear to work with
        # Docker container runs on Toil where PATHs don't get remapped
        utils.copy_plus(os.path.realpath(data["vrn_file"]), out_file)
        vcfutils.bgzip_and_index(out_file, data["config"])
        calls.append(out_file)
    return [out]
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling; the variation file is stored in `vrn_file`.

    Exits the process when more than one variant caller is configured as a
    list. Any resulting variant file is annotated with the "rnaedit" vcfanno
    configuration and then with the population annotation.

    Returns the sample wrapped as `[[data]]`.
    """
    variantcaller = dd.get_variantcaller(data)
    multiple_callers = isinstance(variantcaller, list) and len(variantcaller) > 1
    if multiple_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)

    if dd.get_vrn_file(data):
        # RNA editing annotation first, then population annotation on top of it
        rnaedit_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
        if rnaedit_file:
            data = dd.set_vrn_file(data, rnaedit_file)
        population_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                                 population.do_db_build([data]))
        if population_file:
            data = dd.set_vrn_file(data, population_file)
    return [[data]]
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct call file into `<work_dir>/variants/<calls|gvcf>`,
    then bgzips and indexes it. Samples with a `vrn_file_joint` contribute
    their `vrn_file` as a gVCF and the joint file as the call.

    Returns a one-element list with the "validate" entry of the first item
    and the copied "calls" and "gvcf" files.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    variants = {"calls": [], "gvcf": []}
    out = {"validate": items[0]["validate"], "variants": variants}
    seen = set()
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            if cur_name in seen:
                continue
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            seen.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            variants[out_key].append(out_file)
    return [out]
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Each distinct call file is copied into `<work_dir>/variants/<calls|gvcf>`,
    bgzipped and indexed. Output files are also tracked per sample, in the
    order samples are first seen.

    Returns a one-element list holding a dictionary with "validate" and
    "variants" ("calls", "gvcf", "samples") entries.
    """
    flat_items = [utils.to_single_data(x) for x in utils.flatten(items)]
    normalized = [_normalize_vc_input(x) for x in flat_items]
    items = [utils.to_single_data(x) for x in validate.summarize_grading(normalized)]

    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    done = set()
    variants_by_sample = collections.defaultdict(list)
    sample_order = []

    def _targets(data):
        """List (input key, output group, output name) triples for a sample."""
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is not None:
            return [("vrn_file", "gvcf", dd.get_sample_name(data)),
                    ("vrn_file_joint", "calls", batch_name)]
        return [("vrn_file", "calls", batch_name)]

    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        for vrn_key, out_key, name in _targets(data):
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            group_dir = os.path.join(dd.get_work_dir(data), "variants", out_key)
            out_file = os.path.join(utils.safe_makedir(group_dir), "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                variants_by_sample[s].append(out_file)
            if cur_name not in done:
                done.add(cur_name)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"][out_key].append(out_file)

    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns booleans for "vc", "sv", "jointvc" and "hla". Joint calling is
    needed when any sample has a joint caller or has "gvcf" in its tools_on.
    """
    def _wants_joint(d):
        return dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))

    checkpoints = {}
    for name, check in [("vc", dd.get_variantcaller),
                        ("sv", dd.get_svcaller),
                        ("jointvc", _wants_joint),
                        ("hla", dd.get_hlacaller)]:
        checkpoints[name] = any([check(d) for d in samples])
    return checkpoints
def rnaseq_variant_calling(samples, run_parallel):
    """Run RNA-seq variant calling using GATK.

    After per-sample calling, samples whose first entry uses "gatk-haplotype"
    are squared off jointly and passed through annotation and filtering.
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    uses_gatk = variantcaller and ("gatk-haplotype" in variantcaller)
    if not uses_gatk:
        return samples
    squared = joint.square_off(samples, run_parallel)
    return run_parallel("run_rnaseq_ann_filter", squared)
def _get_jvm_opts(data, out_file):
    """Retrieve JVM options when running the Java version of VarDict.

    Returns a shell prefix exporting VAR_DICT_OPTS when the configured
    variant caller ends with "-java", otherwise an empty string. Options come
    from the "vardict" resources ("jvm_opts", defaulting to -Xms750m/-Xmx4g)
    plus the defaults derived from the output directory.
    """
    if not dd.get_variantcaller(data).endswith("-java"):
        return ""
    resources = config_utils.get_resources("vardict", data["config"])
    configured = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    # Build a new list instead of extending in place: the configured list may
    # be the object stored in the configuration, and in-place extension would
    # accumulate the defaults on every call.
    jvm_opts = configured + broad.get_default_jvm_opts(os.path.dirname(out_file))
    return "export VAR_DICT_OPTS='%s' && " % " ".join(jvm_opts)
def _default_conf_files(data, retriever):
    """Pick the default annotation configuration names for a sample.

    Nothing is selected unless the sample has a variant caller or a variant
    file. Otherwise "gemini", "somatic" and "rnaedit" are added, in that
    order, when their respective checks pass.
    """
    if not (dd.get_variantcaller(data) or dd.get_vrn_file(data)):
        return []
    is_rnaseq = "rna-seq" in dd.get_analysis(data).lower()
    checks = [("gemini", lambda: annotate_gemini(data, retriever)),
              ("somatic", lambda: _annotate_somatic(data, retriever)),
              ("rnaedit", lambda: is_rnaseq)]
    return [name for name, enabled in checks if enabled()]
def _get_jvm_opts(data, out_file):
    """Retrieve JVM options when running the Java version of VarDict.

    Any configured variant caller not ending in "-perl" is treated as the
    Java version. For those, returns a shell prefix exporting VAR_DICT_OPTS,
    built from the "vardict" resources ("jvm_opts", defaulting to
    -Xms750m/-Xmx4g) plus the defaults derived from the output directory.
    Returns an empty string for the "-perl" version.
    """
    if dd.get_variantcaller(data).endswith("-perl"):
        return ""
    resources = config_utils.get_resources("vardict", data["config"])
    configured = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    # Concatenate into a new list rather than extending in place, so a list
    # held by the configuration does not grow with every call.
    jvm_opts = configured + broad.get_default_jvm_opts(os.path.dirname(out_file))
    return "export VAR_DICT_OPTS='%s' && " % " ".join(jvm_opts)
def get_somatic_variantcallers(items):
    """Retrieve all variant callers for somatic calling, handling somatic/germline.

    For samples whose variant caller is a dictionary with a "somatic" entry,
    only the somatic callers are used. Single callers are treated as a
    one-element list.

    Returns the set of callers found across all items (empty for no items).
    """
    out = []
    for data in items:
        vcs = dd.get_variantcaller(data)
        if isinstance(vcs, dict) and "somatic" in vcs:
            vcs = vcs["somatic"]
        if not isinstance(vcs, (list, tuple)):
            vcs = [vcs]
        out += vcs
    # Return the accumulated callers, not just those of the last sample
    return set(out)
def get_type(data):
    """Retrieve the type of effects calculation to do.

    Applies to analyses whose name starts with "var" or samples with a variant
    caller; returns None otherwise. The configured "effects" value (default
    "snpeff") is returned directly, or unwrapped from a one-element list.

    Raises ValueError when the configured value is a list with any other
    number of entries.
    """
    is_variant_analysis = data["analysis"].lower().startswith("var")
    if not (is_variant_analysis or dd.get_variantcaller(data)):
        return None
    etype = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
    if not isinstance(etype, (list, tuple)):
        return etype
    if len(etype) != 1:
        raise ValueError("Unexpected variant effect type for %s: %s" %
                         (dd.get_sample_name(data), etype))
    return etype[0]
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    An existing variant file is annotated with population data; for
    "gatk-haplotype" calls the GATK RNA-seq filter is applied afterwards.

    Returns the sample wrapped as `[[data]]`.
    """
    data = to_single_data(data)
    vrn_file = dd.get_vrn_file(data)
    if vrn_file:
        annotated = population.run_vcfanno(vrn_file, data)
        if annotated:
            data = dd.set_vrn_file(data, annotated)
    variantcaller = dd.get_variantcaller(data)
    uses_gatk = variantcaller and ("gatk-haplotype" in variantcaller)
    if uses_gatk:
        filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filtered)
    return [[data]]
def rnaseq_variant_calling(samples, run_parallel):
    """Run RNA-seq variant calling using GATK.

    With "gatk-haplotype" as the caller of the first sample, calls are squared
    off jointly and split back into single-sample entries. Annotation and
    filtering then run on all samples.
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    uses_gatk = variantcaller and ("gatk-haplotype" in variantcaller)
    if uses_gatk:
        per_sample = []
        for squared in joint.square_off(samples, run_parallel):
            split = multi.split_variants_by_sample(to_single_data(squared))
            per_sample.extend([[to_single_data(xs)] for xs in split])
        samples = per_sample
    return run_parallel("run_rnaseq_ann_filter", samples)
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    Runs one VarDict command per input BAM. With a single BAM the result is
    written straight to the output; with several, each sample is written to a
    temporary VCF and the per-sample files are merged.

    align_bams -- BAM files to call on, paired positionally with `items`
    items -- sample dictionaries; configuration is read from the first one
    ref_file -- reference passed to VarDict via -G
    assoc_files -- not used in this function
    region -- region passed to option building and to the final merge
    out_file -- output VCF; defaults to a name derived from the first BAM

    Returns the path to the output file. Nothing is run when it already exists.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                # NOTE: the command template below is filled from local
                # variable names via locals(); keep names and {placeholders}
                # in sync when editing.
                vardict = dd.get_variantcaller(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage (default 10)
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                cmd = ("{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    # Multiple samples: write each to its own temporary VCF, merged below
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region))
    return out_file
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling with GATK and/or VarDict.

    Exits the process when more than one variant caller is configured as a
    list. GATK calling runs when "gatk" is found in the variant caller;
    VarDict calling runs when a VarDict command is available for the sample.

    Returns the sample wrapped as `[[data]]`.
    """
    variantcaller = dd.get_variantcaller(data)
    multiple_callers = isinstance(variantcaller, list) and len(variantcaller) > 1
    if multiple_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    use_gatk = variantcaller and "gatk" in variantcaller
    if use_gatk:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    return [[data]]
def get_type(data):
    """Retrieve the type of effects calculation to do.

    Only analyses starting with "var", or samples with a variant caller, get
    an effects type; everything else yields None. The "effects" setting
    defaults to "snpeff"; a one-element list is unwrapped.

    Raises ValueError for a list setting without exactly one entry.
    """
    applies = data["analysis"].lower().startswith("var") or dd.get_variantcaller(data)
    if applies:
        etype = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        if isinstance(etype, (list, tuple)):
            if len(etype) != 1:
                raise ValueError("Unexpected variant effect type for %s: %s" %
                                 (dd.get_sample_name(data), etype))
            etype = etype[0]
        return etype
    return None
def _needs_java(data):
    """Check if a caller needs external java for MuTect or older GATK 3.6.

    Returns True for "mutect", or for "gatk"/"gatk-haplotype" when the GATK
    version reported by the runner is below 3.6; False otherwise.
    """
    vc = dd.get_variantcaller(data)
    callers = vc if isinstance(vc, (list, tuple)) else [vc]
    if "mutect" in callers:
        return True
    if "gatk" in callers or "gatk-haplotype" in callers:
        version = broad.runner_from_config(data["config"]).get_gatk_version()
        if LooseVersion(version) < LooseVersion("3.6"):
            return True
    return False
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK.

    The first sample supplies the variant caller, reference and work
    directory. When "gatk" is not in the variant caller the samples are
    returned unchanged; otherwise every sample gets the combined VCF set as
    its squared-off VCF and is returned as a one-element list.
    """
    first = samples[0][0]
    variantcaller = dd.get_variantcaller(first)
    ref_file = dd.get_ref_file(first)
    combined = os.path.join(dd.get_work_dir(first, "."), "variation", "combined.vcf")
    if not (variantcaller and "gatk" in variantcaller):
        return samples
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    combined = variation.gatk_joint_calling(first, vrn_files, ref_file, combined)
    return [[dd.set_square_vcf(d, combined)] for d in dd.sample_data_iterator(samples)]
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns booleans for "vc", "sv", "jointvc", "hla", "align" and
    "align_split". Joint calling additionally requires the sample to have a
    batch. Alignment splitting is on when any sample has an aligner and an
    align_split_size that is not False.
    """
    def _wants_joint(d):
        return (dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)

    def _wants_align(d):
        return dd.get_aligner(d) or dd.get_bam_clean(d)

    def _wants_split(d):
        # Negation of "split size is False, or no aligner"
        return dd.get_align_split_size(d) is not False and dd.get_aligner(d)

    checks = [("vc", dd.get_variantcaller),
              ("sv", dd.get_svcaller),
              ("jointvc", _wants_joint),
              ("hla", dd.get_hlacaller),
              ("align", _wants_align),
              ("align_split", _wants_split)]
    checkpoints = {}
    for name, check in checks:
        checkpoints[name] = any([check(d) for d in samples])
    return checkpoints
def batch_for_jointvc(items):
    """Group samples by (batch, variant caller) for joint calling.

    Jointly called samples are grouped under each of their batches, falling
    back to the sample name when they have none; other samples form their own
    group under the sample name. Every grouped entry is a copy with
    `vrn_file_gvcf` set to its `vrn_file`.

    Returns a list of sample groups.
    """
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    # Materialize as a list so callers get an indexable value rather than a
    # dictionary view.
    return list(batch_groups.values())
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data.

    Works on a copy of `data`: turns on the "gvcf" tool, sets the jointcaller
    to "<caller>-joint" for each configured variant caller and runs the GATK
    haplotype caller on the split BAM.

    Returns the copied data with `vrn_file` set to the haplotype caller output.
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    # The variant caller may be a bare string rather than a list; wrap it so
    # the joint caller name is not built per character.
    callers = dd.get_variantcaller(data)
    if not isinstance(callers, (list, tuple)):
        callers = [callers] if callers else []
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in callers])
    out_dir = utils.safe_makedir(os.path.join("variation", "rnaseq", "gatk-haplotype"))
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                     dd.get_ref_file(data), {},
                                     out_file=out_file)
    return dd.set_vrn_file(data, out_file)
def batch_for_jointvc(items):
    """Group samples by (batch, variant caller) for joint calling.

    Jointly called samples go into each of their batches, or under their
    sample name when they have none; other samples are grouped under their
    sample name. Grouped entries are copies with `vrn_file_gvcf` set to their
    `vrn_file`.

    Returns a list of sample groups.
    """
    def _group_names(data):
        if not genotype.is_joint(data):
            return [dd.get_sample_name(data)]
        names = dd.get_batches(data) or dd.get_sample_name(data)
        return names if isinstance(names, (list, tuple)) else [names]

    singles = [utils.to_single_data(x) for x in items]
    groups = collections.defaultdict(list)
    for data in singles:
        vc = dd.get_variantcaller(data)
        for name in _group_names(data):
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            groups[(name, vc)].append(data)
    return list(groups.values())
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples. Without a normal BAM the
    work is handed to `_run_vardict_caller` instead.

    align_bams -- BAM files for the batch
    items -- sample dictionaries; configuration is read from the first one
    ref_file -- reference passed to VarDict via -G
    assoc_files -- only forwarded to `_run_vardict_caller`
    region -- region passed to option building
    out_file -- output VCF; defaults to a name derived from the first BAM

    Returns the path to the output file. Nothing is run when it already exists.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only: fall back to the unpaired caller
                ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file)
                return ann_file
            # NOTE: the command template below is filled from local variable
            # names via locals(); keep names and {placeholders} in sync when
            # editing.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = dd.get_variantcaller(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # min_allele_fraction is configured as a percentage (default 10)
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region, do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The extra somatic filter is skipped when any sample lists
            # "vardict_somatic_filter" in tools_off
            if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items):
                somatic_filter = ""
            else:
                somatic_filter = ("| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                  os.path.join(os.path.dirname(sys.executable), "py"))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            cmd = ("{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                   "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                   "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
def get_active_vcinfo(data):
    """Use first caller if ensemble is not active.

    Returns None when the sample has no variant caller. Otherwise returns the
    "ensemble" entry from `data["variants"]` if there is one, else the first
    entry with a `vrn_file`, else None.
    """
    # Only the presence of a caller matters; its value is not used further
    if not dd.get_variantcaller(data):
        return None
    first_with_calls = None
    for v in data.get("variants") or []:
        if v.get("variantcaller") == "ensemble":
            return v
        if first_with_calls is None and v.get("vrn_file"):
            first_with_calls = v
    return first_with_calls
def get_vardict_command(data):
    """Convert the variantcaller specification to the proper VarDict command.

    Handles string or list specifications. The first caller containing
    "vardict" decides the result: "vardict" when it ends with "-perl",
    otherwise "vardict-java".

    Returns None when no VarDict caller is configured.
    """
    vcaller = dd.get_variantcaller(data)
    if not vcaller:
        return None
    # Treat a single caller like a one-element list, so a non-VarDict caller
    # given as a string yields None just as it does inside a list.
    if not isinstance(vcaller, (list, tuple)):
        vcaller = [vcaller]
    vardict = [x for x in vcaller if "vardict" in x]
    if not vardict:
        return None
    return "vardict" if vardict[0].endswith("-perl") else "vardict-java"
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions

    Supported callers are freebayes, platypus and gatk-haplotype; for any
    other caller nothing is run and None is returned. Otherwise returns the
    BED file path, regenerating it when it is not up to date relative to
    `vrn_file`.
    """
    gvcf_types = {"freebayes": "freebayes", "platypus": "platypus", "gatk-haplotype": "gatk"}
    gvcf_type = gvcf_types.get(dd.get_variantcaller(data))
    if not gvcf_type:
        return None
    base_name = utils.splitext_plus(os.path.basename(vrn_file))[0]
    out_file = os.path.join(out_dir, "%s-gcvf-coverage.bed" % base_name)
    if utils.file_uptodate(out_file, vrn_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        template = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                    "| bedtools merge > {tx_out_file}")
        command = template.format(gvcf_type=gvcf_type, vrn_file=vrn_file, tx_out_file=tx_out_file)
        do.run(command, "Convert gVCF to BED file of callable regions")
    return out_file
def _variant_checkpoints(samples): """Check sample configuration to identify required steps in analysis. """ checkpoints = {} checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples]) checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples]) checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)) for d in samples]) checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples]) checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples]) checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or not dd.get_aligner(d)) for d in samples]) checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples]) checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples]) checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples) return checkpoints
def _clean_flat_variantcaller(data):
    """Convert flattened dictionary from CWL representation into dictionary.

    CWL flattens somatic/germline tags into a set of strings, which we
    reconstitute as a dictionary.

    Only a list or tuple in which every item contains exactly one ":" is
    converted; each "tag:caller" item is grouped into ``{tag: [caller, ...]}``.
    Any other specification is left untouched and ``data`` is returned as is.
    """
    flat = dd.get_variantcaller(data)
    if not isinstance(flat, (list, tuple)):
        return data
    if not all([item.count(":") == 1 for item in flat]):
        return data
    by_tag = {}
    for item in flat:
        tag, caller = item.split(":")
        by_tag.setdefault(tag, []).append(caller)
    return dd.set_variantcaller(data, by_tag)
def _get_active_vcinfo(data):
    """Use first caller if ensemble is not active

    Returns the ``data["variants"]`` entry to use for the sample: the entry
    whose ``variantcaller`` is ``"ensemble"`` if one exists, otherwise the
    first entry with a truthy ``vrn_file``. Returns None when the sample has
    no variant caller configured, no ``variants`` key, or no usable entry.
    """
    if not dd.get_variantcaller(data):
        return None
    fallback = None
    if "variants" in data:
        for vinfo in data["variants"]:
            # ensemble takes priority regardless of its position in the list
            if vinfo.get("variantcaller") == "ensemble":
                return vinfo
            if fallback is None and vinfo.get("vrn_file"):
                fallback = vinfo
    return fallback
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples when a GATK caller is configured.

    The variant caller is read from the first sample. When it is unset or
    does not contain "gatk", or when no samples are passed, ``samples`` is
    returned unchanged. Otherwise the per-sample variant files are joint
    called, annotated with the "rnaedit" vcfanno configuration, and each
    sample is returned as ``[data]`` with the joint file set as its square VCF.
    """
    if not samples:
        return samples
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
    return [[dd.set_square_vcf(d, vrn_file)]
            for d in dd.sample_data_iterator(samples)]
def _variant_checkpoints(samples): """Check sample configuration to identify required steps in analysis. """ checkpoints = {} checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples]) checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples]) checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)) for d in samples]) checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples]) checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples]) checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or not dd.get_aligner(d)) for d in samples]) checkpoints["archive"] = any([dd.get_archive(d) for d in samples]) checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples]) checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples]) checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples) return checkpoints
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.

    Samples that need no preparation are passed through unchanged and placed
    ahead of the prepared samples in the returned list. Every sample with a
    ``work_bam`` also gets it recorded as ``align_bam``.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    passthrough = []
    needs_prep = []
    for data in [sample[0] for sample in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        # preparation is only relevant with recalibration, realignment or
        # variant calling configured, and only possible with an input file
        wants_prep = (dd.get_recalibrate(data) or dd.get_realign(data)
                      or dd.get_variantcaller(data))
        if wants_prep and data.get(file_key):
            needs_prep.append([data])
        else:
            passthrough.append([data])
    prepared = parallel_split_combine(needs_prep, split_fn, run_parallel,
                                      "piped_bamprep", _add_combine_info,
                                      file_key, ["config"])
    return passthrough + prepared
def _get_variant_callers(data):
    """Use first caller if ensemble is not active

    Returns the ``vrn_file`` of the "ensemble" entry in ``data["variants"]``
    when one is present, otherwise the ``vrn_file`` of the first configured
    caller. If that value is empty, the ``germline`` value recorded for the
    first configured caller is returned instead.

    Returns None when no variant caller is configured, or when the first
    configured caller has no entry in ``data["variants"]``.
    """
    callers = dd.get_variantcaller(data)
    if not callers:
        return None
    if isinstance(callers, basestring):
        callers = [callers]
    vcf_by_caller = {}
    germline_by_caller = {}
    for variant in data.get("variants", []):
        name = variant.get("variantcaller")
        vcf_by_caller[name] = variant.get("vrn_file")
        germline_by_caller[name] = variant.get("germline")
    if "ensemble" in vcf_by_caller:
        vcf_fn = vcf_by_caller["ensemble"]
    else:
        # .get: the configured caller may have produced no entry in "variants"
        vcf_fn = vcf_by_caller.get(callers[0])
    if not vcf_fn:
        vcf_fn = germline_by_caller.get(callers[0])
    return vcf_fn
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict

    Logs an error and exits the process with status 1 when more than one
    variant caller is configured as a list. Returns ``[[data]]``, where
    ``data`` has been passed through the GATK and/or VarDict RNA-seq callers
    that apply, or is unchanged when no variant caller is configured.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)
    if not variantcaller:
        return [[data]]
    if "gatk-haplotype" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    return [[data]]
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    Each step reads the sample's current variant file and, when it produces
    a result, stores that result back as the variant file, so the steps
    build on one another in order: effects annotation, vcfanno annotation,
    GATK RNA-seq filtering (only when "gatk-haplotype" is among the
    configured callers) and finally splice-junction filtering.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    # NOTE(review): to_single_data is defined outside this block; presumably
    # it unwraps a nested sample argument into a single dict -- confirm.
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        # add_to_vcf returns a sequence; only its first element is used
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        # re-read the variant file so vcfanno runs on the effects output when
        # one was produced
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    # remove variants close to splice junctions
    # NOTE(review): this step is not guarded by a check that a variant file
    # exists -- confirm filter_junction_variants accepts an empty value.
    vrn_file = dd.get_vrn_file(data)
    vrn_file = variation.filter_junction_variants(vrn_file, data)
    data = dd.set_vrn_file(data, vrn_file)
    return [[data]]