def summarize_vc(items):
    """CWL target: gather variant calls and validation results across samples.

    Inputs are flattened, normalized and graded, then every item carrying a
    ``vrn_file`` has its variant file(s) copied into
    ``<work_dir>/variants/<calls|gvcf>`` and passed to
    ``vcfutils.bgzip_and_index``.

    Returns a single-element list holding a dictionary with:

    - ``validate``: the result of ``validate.combine_validations``.
    - ``variants``: ``calls`` and ``gvcf`` lists of copied files, plus
      ``samples``, one list of output files per sample in first-seen order.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]

    variants = {"calls": [], "gvcf": [], "samples": []}
    out = {"validate": validate.combine_validations(items),
           "variants": variants}

    seen_names = set()
    files_by_sample = collections.defaultdict(list)
    ordered_samples = []
    for data in items:
        default_samples = [dd.get_sample_name(data)]
        batch_samples = data.get("batch_samples", default_samples)
        # Remember the order in which samples are first encountered.
        for sample in batch_samples:
            if sample not in ordered_samples:
                ordered_samples.append(sample)
        if not data.get("vrn_file"):
            continue

        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        if dd.get_variantcaller(data):
            names = dd.get_batches(data)
        else:
            names = None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]

        # With a joint file present, the per-sample file is exported as gVCF
        # and the joint file as the batch calls.
        if data.get("vrn_file_joint") is None:
            targets = [("vrn_file", "calls", batch_name)]
        else:
            targets = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                       ("vrn_file_joint", "calls", batch_name)]

        for vrn_key, out_key, name in targets:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            # Every sample in the batch references the file, even when the
            # copy itself was already done for an earlier item.
            for sample in batch_samples:
                files_by_sample[sample].append(out_file)
            if cur_name in seen_names:
                continue
            seen_names.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            variants[out_key].append(out_file)

    variants["samples"].extend(files_by_sample[sample]
                               for sample in ordered_samples)
    return [out]
def summarize_sv(items):
    """CWL target: gather structural variant outputs across samples.

    VCF-like call files found under ``data["sv"]["vrn_file"]`` are copied to
    ``<work_dir>/sv/calls`` and passed to ``vcfutils.bgzip_and_index``; each
    batch/caller name is exported only once. Prioritization is then run per
    sample group and its ``sv-prioritize`` outputs collected.

    Returns a single-element list holding a dictionary with ``sv`` (``calls``
    and ``prioritize`` with ``tsv``/``raw`` lists) and ``svvalidate``.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = vcvalidate.summarize_grading(items, "svvalidate")
    items = [utils.to_single_data(x) for x in graded]

    calls = []
    prioritized_out = {"tsv": [], "raw": []}
    out = {"sv": {"calls": calls, "prioritize": prioritized_out},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}

    # Standard callers
    exported = set()
    for data in items:
        if not data.get("sv"):
            continue
        sv_info = data["sv"]
        names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        # The name is built before the vrn_file check, as in the original flow.
        cur_name = "%s-%s" % (names[0], sv_info["variantcaller"])
        if not sv_info.get("vrn_file"):
            continue
        ext = utils.splitext_plus(sv_info["vrn_file"])[-1]
        if cur_name in exported or not ext.startswith(".vcf"):
            continue
        exported.add(cur_name)
        calls_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "sv", "calls"))
        out_file = os.path.join(calls_dir, "%s%s" % (cur_name, ext))
        utils.copy_plus(sv_info["vrn_file"], out_file)
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        calls.append(out_file)

    # prioritization
    for pdata in _group_by_sample(items):
        result = prioritize.run([utils.deepish_copy(pdata)])[0]
        hits = [x for x in result.get("sv", [])
                if x["variantcaller"] == "sv-prioritize"]
        if not hits:
            continue
        # Only the first prioritization entry is reported.
        first = hits[0]
        prioritized_out["tsv"].append(first["vrn_file"])
        prioritized_out["raw"].extend(first["raw_files"].values())
    return [out]
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for many samples.

    After flattening, normalizing and grading the inputs, each item with a
    ``vrn_file`` contributes files that are copied under
    ``<work_dir>/variants/calls`` or ``<work_dir>/variants/gvcf`` and handed
    to ``vcfutils.bgzip_and_index``. A given ``<name>-<caller>`` is copied
    only once, but every batch sample still records the output path.

    Returns ``[out]`` where ``out`` has the keys ``validate`` and ``variants``
    (``calls``, ``gvcf`` and per-sample ``samples`` lists).
    """
    flat = [utils.to_single_data(x) for x in utils.flatten(items)]
    normalized = [_normalize_vc_input(x) for x in flat]
    graded = validate.summarize_grading(normalized)
    items = [utils.to_single_data(x) for x in graded]

    out = {
        "validate": validate.combine_validations(items),
        "variants": {"calls": [], "gvcf": [], "samples": []},
    }
    copied = set()
    per_sample = collections.defaultdict(list)
    order = []

    for data in items:
        fallback = [dd.get_sample_name(data)]
        batch_samples = data.get("batch_samples", fallback)
        # Track samples in the order they first appear.
        order.extend(s for s in batch_samples if s not in order)

        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            names = names or [dd.get_sample_name(data)]
            batch_name = names[0]

            has_joint = data.get("vrn_file_joint") is not None
            if has_joint:
                # Per-sample file becomes the gVCF; the joint file the calls.
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]

            for vrn_key, out_key, name in to_add:
                caller = dd.get_variantcaller(data)
                cur_name = "%s-%s" % (name, caller)
                target_dir = os.path.join(dd.get_work_dir(data), "variants", out_key)
                out_file = os.path.join(utils.safe_makedir(target_dir),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    per_sample[s].append(out_file)
                if cur_name not in copied:
                    copied.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    source = os.path.realpath(data[vrn_key])
                    utils.copy_plus(source, out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)

    sample_files = out["variants"]["samples"]
    for sample in order:
        sample_files.append(per_sample[sample])
    return [out]
def summarize_sv(items):
    """CWL target: gather structural variant calls across samples.

    Items are graded with ``vcvalidate.summarize_grading``; for each item
    with an ``sv`` entry whose ``vrn_file`` has a ``.vcf*`` extension, the
    file is copied to ``<work_dir>/sv/calls``, passed to
    ``vcfutils.bgzip_and_index`` and recorded once per ``<batch>-<caller>``.

    Returns a single-element list holding a dictionary with ``sv`` (a
    ``calls`` list) and ``svvalidate``.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = vcvalidate.summarize_grading(items, "svvalidate")
    items = [utils.to_single_data(x) for x in graded]

    calls = []
    out = {"sv": {"calls": calls},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}

    exported = set()
    for data in items:
        if not data.get("sv"):
            continue
        sv_info = data["sv"]
        # Fall back to the sample name when the item belongs to no batch.
        names = dd.get_batches(data) or [dd.get_sample_name(data)]
        cur_name = "%s-%s" % (names[0], sv_info["variantcaller"])
        if not sv_info.get("vrn_file"):
            continue
        ext = utils.splitext_plus(sv_info["vrn_file"])[-1]
        if cur_name in exported or not ext.startswith(".vcf"):
            continue
        exported.add(cur_name)
        calls_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "sv", "calls"))
        out_file = os.path.join(calls_dir, "%s%s" % (cur_name, ext))
        utils.copy_plus(sv_info["vrn_file"], out_file)
        # Keep the path returned by bgzip_and_index as the recorded output.
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        calls.append(out_file)
    return [out]
def summarize_sv(items):
    """CWL target: gather structural variant outputs across samples.

    For each graded item with an ``sv`` entry:

    - a ``vrn_file`` with a ``.vcf*`` extension is copied to
      ``<work_dir>/sv/calls`` under the name from ``_useful_basename``,
      passed to ``vcfutils.bgzip_and_index`` and recorded once per name;
    - truthy entries of ``supplemental`` are appended to the output.

    Prioritization is then run per sample group and the first
    ``sv-prioritize`` result contributes its ``vrn_file`` and raw files.

    Returns a single-element list holding a dictionary with ``sv``
    (``calls``, ``supplemental``, ``prioritize``) and ``svvalidate``.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = vcvalidate.summarize_grading(items, "svvalidate")
    items = [utils.to_single_data(x) for x in graded]

    sv_out = {"calls": [], "supplemental": [],
              "prioritize": {"tsv": [], "raw": []}}
    out = {"sv": sv_out,
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}

    # Standard callers
    exported = set()
    for data in items:
        if not data.get("sv"):
            continue
        sv_info = data["sv"]
        if sv_info.get("vrn_file"):
            ext = utils.splitext_plus(sv_info["vrn_file"])[-1]
            cur_name = _useful_basename(data)
            is_new_vcf = cur_name not in exported and ext.startswith(".vcf")
            if is_new_vcf:
                exported.add(cur_name)
                calls_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "sv", "calls"))
                out_file = os.path.join(calls_dir, "%s%s" % (cur_name, ext))
                utils.copy_plus(sv_info["vrn_file"], out_file)
                out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                sv_out["calls"].append(out_file)
        # Supplemental files are collected whether or not a VCF was exported.
        if sv_info.get("supplemental"):
            sv_out["supplemental"].extend(
                x for x in sv_info["supplemental"] if x)

    # prioritization
    for pdata in _group_by_sample(items):
        result = prioritize.run([utils.deepish_copy(pdata)])[0]
        hits = [x for x in result.get("sv", [])
                if x["variantcaller"] == "sv-prioritize"]
        if not hits:
            continue
        first = hits[0]
        sv_out["prioritize"]["tsv"].append(first["vrn_file"])
        sv_out["prioritize"]["raw"].extend(first["raw_files"].values())
    return [out]