def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: Non-empty list of per-sample data dictionaries. The first item
            supplies the work directory, configuration and reference file
            used for the whole batch.

    Returns:
        A list with the same data dictionaries, each with a delly entry
        appended to its "sv" list.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    # setdefault creates the "resources" section when the configuration lacks
    # one, instead of failing with a KeyError on assignment.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    sv_types = ["DEL", "DUP"]  # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    exclude_file = _get_full_exclude_file(items, work_dir)
    # One delly job per (chromosome, SV type) combination.
    delly_jobs = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for (chrom, sv_type)
                  in itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)]
    bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    # The combined file name is the same for every sample, so split it once.
    base, ext = utils.splitext_plus(combo_vcf)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: Non-empty list of per-sample data dictionaries. The first item
            supplies the work directory, configuration and reference file
            used for the whole batch.

    Returns:
        A list with the same data dictionaries. Items for which
        ``sshared.finalize_sv`` yields a file gain a delly entry in their
        "sv" list; the others are returned with "sv" present but unchanged.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural",
                     dd.get_sample_name(first), "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    # setdefault creates the "resources" section when the configuration lacks
    # one, instead of failing with a KeyError on assignment.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(first)
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    # One delly job per chromosome.
    delly_jobs = [(work_bams, chrom, ref_file, work_dir, items)
                  for chrom in sshared.get_sv_chroms(items, exclude_file)]
    bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    # Number of times each finalized file has already been attached to a sample.
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({"variantcaller": "delly",
                               "vrn_file": delly_vcf,
                               "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
def _combine_out_files(chr_files, work_dir, data):
    """Concatenate all CNV calls into a single file.

    Input files whose first line starts with "track name=empty" are skipped.
    An already existing output file is reused as-is. ``work_dir`` is accepted
    but not used by this function.

    Returns the path of the combined BED file.
    """
    combined_bed = "%s.bed" % sshared.outname_from_inputs(chr_files)
    if utils.file_exists(combined_bed):
        return combined_bed
    with file_transaction(data, combined_bed) as tx_combined:
        with open(tx_combined, "w") as writer:
            for per_chrom in chr_files:
                with open(per_chrom) as reader:
                    header = reader.readline()
                    if header.startswith("track name=empty"):
                        continue
                    # Emit the line already consumed, then stream the rest.
                    writer.write(header)
                    shutil.copyfileobj(reader, writer)
    return combined_bed
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: Non-empty list of per-sample data dictionaries. The first item
            supplies the work directory, configuration and reference file
            used for the whole batch.

    Returns:
        A list with the same data dictionaries, each with a delly entry
        appended to its "sv" list.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    # setdefault creates the "resources" section when the configuration lacks
    # one, instead of failing with a KeyError on assignment.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    sv_types = ["DEL", "DUP"]  # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    exclude_file = _get_full_exclude_file(items, work_dir)
    # One delly job per (chromosome, SV type) combination.
    delly_jobs = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for (chrom, sv_type)
                  in itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)]
    bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    # The combined file name is the same for every sample, so split it once.
    base, ext = utils.splitext_plus(combo_vcf)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: Non-empty list of per-sample data dictionaries. The first item
            supplies the work directory, configuration and reference file
            used for the whole batch.

    Returns:
        A list with the same data dictionaries. Items for which
        ``sshared.finalize_sv`` yields a file gain a delly entry in their
        "sv" list; the others are returned with "sv" present but unchanged.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural",
                     dd.get_sample_name(first), "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    # setdefault creates the "resources" section when the configuration lacks
    # one, instead of failing with a KeyError on assignment.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(first)
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    # One delly job per chromosome.
    delly_jobs = [(work_bams, chrom, ref_file, work_dir, items)
                  for chrom in sshared.get_sv_chroms(items, exclude_file)]
    bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    # Number of times each finalized file has already been attached to a sample.
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({"variantcaller": "delly",
                               "vrn_file": delly_vcf,
                               "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out