def _has_ensemble(data): # for tumour-normal calling, a sample may have "ensemble" for the normal # sample configured but there won't be any variant files per se variants_to_process = (len(data["variants"]) > 1 and any([x.get('vrn_file', None) is not None or x.get('vrn_file_batch', None) is not None for x in data["variants"]])) return variants_to_process and dd.get_ensemble(data)
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.

    Invoked either as (batch_id, samples, data) from the standard pipeline,
    or with a flat list of per-sample CWL records; the two cases are
    distinguished by argument count. Returns [{"ensemble": callinfo}] for CWL
    runs and [[batch_id, callinfo]] otherwise.
    """
    if len(args) == 3:
        # Standard (non-CWL) invocation: caller names and per-caller VCFs are
        # extracted from the batch's samples.
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        # CWL invocation: each arg is a wrapped sample record; caller names and
        # call files come pre-organized on the first sample's "variants" entry.
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    # Work on a deep copy so ensemble-specific mutations below (variantcaller
    # name, vrn_file, ensemble_bed) do not leak into the shared sample data.
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True,
                                         nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            # No "classifiers" key configured: use the intersection-based consensus.
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            # Classifier-based ensemble, driven by a generated config file.
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        # No input callset has any variants: write an empty (but bgzipped and
        # indexed) VCF so downstream steps still find a file for the batch.
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
def _variant_checkpoints(samples): """Check sample configuration to identify required steps in analysis. """ checkpoints = {} checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples]) checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples]) checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)) for d in samples]) checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples]) checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples]) checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or not dd.get_aligner(d)) for d in samples]) checkpoints["archive"] = any([dd.get_archive(d) for d in samples]) checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples]) checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples]) checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples) return checkpoints
def _variant_checkpoints(samples): """Check sample configuration to identify required steps in analysis. """ checkpoints = {} checkpoints["vc"] = any( [dd.get_variantcaller(d) or d.get("vrn_file") for d in samples]) checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples]) checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)) for d in samples]) checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples]) checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples]) checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or not dd.get_aligner(d)) for d in samples]) checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples]) checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples]) return checkpoints
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.

    Invoked either as (batch_id, samples, data) from the standard pipeline,
    or with a flat list of per-sample CWL records; the two cases are
    distinguished by argument count. Returns [{"ensemble": callinfo}] for CWL
    runs and [[batch_id, callinfo]] otherwise.
    """
    if len(args) == 3:
        # Standard (non-CWL) invocation: caller names and per-caller VCFs are
        # extracted from the batch's samples.
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        # CWL invocation: each arg is a wrapped sample record; caller names and
        # call files come pre-organized on the first sample's "variants" entry.
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    # Work on a deep copy so ensemble-specific mutations below (variantcaller
    # name, vrn_file, ensemble_bed) do not leak into the shared sample data.
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(f, data, passonly=passonly, rerun_effects=False,
                                remove_oldeffects=True, nonrefonly=True,
                                work_dir=utils.safe_makedir(
                                    os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)
        ]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            # No "classifiers" key configured: use the intersection-based consensus.
            callinfo = _run_ensemble_intersection(batch_id, vrn_files,
                                                  caller_names, base_dir, edata)
        else:
            # Classifier-based ensemble, driven by a generated config file.
            config_file = _write_config_file(batch_id, caller_names, base_dir,
                                             edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(
                callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get(
            "validate")
    else:
        # No input callset has any variants: write an empty (but bgzipped and
        # indexed) VCF so downstream steps still find a file for the batch.
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
            "bed_file": None
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]