def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Gathers the full, split-read and discordant BAMs of every sample via
    ``sshared.get_split_discordants``, calls ``_run_lumpy`` once on all of
    them, then for each sample selects its records from the joint VCF and
    passes them through ``_run_svtyper`` and ``_filter_by_support``.

    items -- list of sample dictionaries; each must provide "align_bam" and
             "config".

    Returns a list with the same dictionaries in input order, each having a
    lumpy entry ("variantcaller", "vrn_file", "exclude_file") appended to its
    "sv" list (the list is created when absent).

    Raises ValueError when any sample's config/algorithm/aligner is something
    other than "bwa", False or None.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Derive the working directory from the tumor sample when one is paired,
    # otherwise from the first sample.
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    # The positional index is not needed here, so iterate over the samples directly.
    for data in items:
        data.setdefault("sv", [])
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": filter_vcf,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Runs ``_run_lumpy`` once over the full, split-read and discordant BAMs of
    all samples. For each sample the joint VCF is subset to that sample and
    split with ``_split_breakends``; only the standard (non-breakend) part is
    passed to ``_run_svtyper``, after which both parts are recombined and
    filtered with ``_filter_by_support``. When a paired normal is present the
    per-sample results additionally go through ``_filter_by_background`` with
    the tumor and normal sample names. Each result is finally handed to
    ``effects.add_to_vcf`` with "snpeff".

    items -- list of sample dictionaries; each must provide "align_bam" and
             "config".

    Returns a list with the same dictionaries in input order, each having a
    lumpy entry ("variantcaller", "vrn_file", "exclude_file") appended to its
    "sv" list (the list is created when absent).

    Raises ValueError when any sample's config/algorithm/aligner is something
    other than "bwa", False or None.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        # Only the non-breakend records are passed to svtyper; the breakend
        # file is merged back in unchanged.
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data),
            config=data["config"])
        # Reuse the sample name looked up at the top of the iteration.
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        # Fall back to the unannotated VCF when no effects file is returned.
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    For every sample this collects the full, split-read and discordant BAMs
    plus any existing deletion/duplication BEDPE files reported by
    ``_bedpes_from_cnv_caller``, then calls ``_run_lumpy`` once with all of
    them. Each sample's records are selected from the joint VCF and handled
    in one of two ways:

    - "bnd-genotype" in the sample's tools_on: the whole VCF goes through
      ``_run_svtyper``.
    - otherwise: the VCF is split with ``_split_breakends``, only the standard
      part goes through ``_run_svtyper`` and the two parts are concatenated
      again.

    Results are filtered with ``_filter_by_support`` and, when a paired normal
    is present, with ``_filter_by_background``. ``effects.add_to_vcf`` is only
    invoked for samples where ``dd.get_svprioritize`` is truthy.

    items -- list of sample dictionaries; each must provide "align_bam" and
             "config".

    Returns a list with the same dictionaries in input order, each having a
    lumpy entry ("variantcaller", "vrn_file", "exclude_file") appended to its
    "sv" list (the list is created when absent).

    Raises ValueError when any sample's config/algorithm/aligner is something
    other than "bwa", False or None.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None]
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        # Look the sample name up once and fill its evidence dictionary in place.
        sample_evidence = {}
        previous_evidence[dd.get_sample_name(data)] = sample_evidence
        if cur_dels and utils.file_exists(cur_dels):
            sample_evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            sample_evidence["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, exclude_file, data)
        else:
            # Breakend records bypass svtyper and are concatenated back afterwards.
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        # Fall back to the unannotated VCF when no effects file is available.
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq
    defaults and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently not used as new versions of delly do not get good
    sensitivity with downsampled BAMs.

    Returns a single-element list with the merged "-final" BAM path.
    """
    split_bam, discordant_bam = sshared.get_split_discordants(data, work_dir)
    pair_filter = "-F 'not secondary_alignment and proper_pair'"
    downsampled_bam = bam.downsample(dd.get_align_bam(data), data, 1e8,
                                     read_filter=pair_filter,
                                     always_run=True,
                                     work_dir=work_dir)
    # Insert "-final" between the downsampled file's base name and extension.
    merged_bam = "%s-final%s" % utils.splitext_plus(downsampled_bam)
    already_merged = utils.file_exists(merged_bam)
    if not already_merged:
        merge_inputs = [downsampled_bam, split_bam, discordant_bam]
        bam.merge(merge_inputs, merged_bam, data["config"])
    # NOTE(review): indexing is placed after the conditional merge; the
    # collapsed source does not show whether it belonged inside the `if`.
    bam.index(merged_bam, data["config"])
    return [merged_bam]
def _prep_subsampled_bams(data, work_dir):
    """Prepare the BAM input for calling; downsampling is currently disabled.

    The original approach tried to minimize run times by pre-extracting useful
    reads (split and discordant reads from samblaster) mixed with subsampled
    normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    with correctly aligned reads subsampled to 100 million based on speedseq
    defaults and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently does not downsample as new versions do not get good
    sensitivity with downsampled BAMs. The downsample/merge/index steps that
    used to follow the early return could never execute and were dropped.

    Returns a single-element list holding the full BAM reported by
    ``sshared.get_split_discordants``.
    """
    # The split-read and discordant BAMs are not needed while downsampling is off.
    full_bam, _sr_bam, _disc_bam = sshared.get_split_discordants(data, work_dir)
    return [full_bam]
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Creates a "structural/<name>/lumpy" working directory under the first
    sample's work dir, runs ``_run_lumpy`` once over the BAMs of all samples
    to get a BEDPE file, converts that to VCF with ``_bedpe_to_vcf`` and then
    builds per-sample BEDPE and (when a VCF was produced) VCF outputs.

    Returns the input sample dictionaries in order, each with a lumpy entry
    appended to its "sv" list (the list is created when absent).

    Raises ValueError when any sample's config/algorithm/aligner is something
    other than "bwa", False or None.
    """
    accepted_aligners = ("bwa", False, None)
    for data in items:
        if utils.get_in(data, ("config", "algorithm", "aligner")) not in accepted_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    first = items[0]
    lumpy_dir = os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "lumpy")
    work_dir = utils.safe_makedir(lumpy_dir)

    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        split_outputs = sshared.get_split_discordants(data, work_dir)
        dedup_bam, sr_bam, disc_bam = split_outputs
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)

    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)

    out = []
    for index, data in enumerate(items):
        data.setdefault("sv", [])
        sample = tz.get_in(["rgnames", "sample"], data)
        # The sample's position in `items` is passed to the BEDPE helpers.
        subset_bedpe = _subset_to_sample(pebed_file, index, data)
        sample_bedpe = _filter_by_support(subset_bedpe, index, data)
        sample_vcf = None
        if lumpy_vcf:
            selected_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                  utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                  data["config"])
            sample_vcf = _filter_by_bedpe(selected_vcf, sample_bedpe, data)
        sv_entry = {
            "variantcaller": "lumpy",
            "vrn_file": sample_vcf,
            "exclude_file": exclude_file,
            "bedpe_file": sample_bedpe,
            "sample_bed": sample_config_file,
        }
        data["sv"].append(sv_entry)
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    For every sample this collects the aligned BAM, the split-read and
    discordant BAMs and any existing deletion/duplication BEDPE files reported
    by ``_bedpes_from_cnv_caller``, then calls ``_run_lumpy`` once with all of
    them. Each sample's records are selected from the joint VCF and handled
    in one of three ways:

    - "bnd-genotype" in the sample's tools_on: the whole VCF goes through
      ``_run_svtyper``.
    - "lumpy-genotype" in the sample's tools_off: ``_run_svtyper`` is skipped
      and the selected VCF is used as is.
    - otherwise: the VCF is split with ``_split_breakends``, only the standard
      part goes through ``_run_svtyper`` and the two parts are concatenated
      again.

    Results are filtered with ``_filter_by_support`` and, when a paired normal
    is present, with ``_filter_by_background``. ``effects.add_to_vcf`` is only
    invoked for samples where ``dd.get_svprioritize`` is truthy.

    items -- list of sample dictionaries; each must provide "align_bam" and
             "config".

    Returns a list with the same dictionaries in input order, each having a
    lumpy entry ("variantcaller", "vrn_file", "exclude_file") appended to its
    "sv" list (the list is created when absent).

    Raises ValueError when any sample's config/algorithm/aligner is something
    other than "bwa", "sentieon-bwa", False or None.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        # Look the sample name up once and fill its evidence dictionary in place.
        sample_evidence = {}
        previous_evidence[dd.get_sample_name(data)] = sample_evidence
        if cur_dels and utils.file_exists(cur_dels):
            sample_evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            sample_evidence["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            # svtyper is switched off for this sample; keep the selected calls as is.
            gt_vcf = sample_vcf
        else:
            # Breakend records bypass svtyper and are concatenated back afterwards.
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        # Fall back to the unannotated VCF when no effects file is available.
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run(in_file, work_dir, data):
    """Run ``_run_svtyper`` on `in_file` using the sample's split/discordant outputs.

    The first two values returned by ``sshared.get_split_discordants`` are
    forwarded together with the "exclude_file" entry of `call`.

    NOTE(review): `call` is not a parameter of this function; it has to come
    from an enclosing or module scope that is not visible here.
    """
    split_outputs = sshared.get_split_discordants(data, work_dir)
    full_bam, split_bam, _ = split_outputs
    exclude_file = call.get("exclude_file")
    return _run_svtyper(in_file, full_bam, split_bam, exclude_file, data)