def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    Returns a list of single-sample lists, one per input sample. Samples using
    the general SV bins gain a ``data["regions"]["bins"]`` entry with the
    target BED, the antitarget BED and the stringified index of their CNV
    group.

    Raises AssertionError when the number of samples produced differs from the
    number of samples supplied.
    """
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    # Nothing to prepare when no sample asks for the general SV bins.
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    batches = multi.group_by_batch(items, False)
    for group_index, cnv_group in enumerate(_group_by_cnv_method(batches)):
        sizer = MemoizedSizes(cnv_group.region_file, cnv_group.items)
        size_calc_fn = sizer.get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                background = dd.get_background_cnv_reference(data)
                if background:
                    target_bed, anti_bed = cnvkit.targets_from_background(
                        background, cnv_group.work_dir, data)
                else:
                    target_bed, anti_bed = cnvkit.targets_w_bins(
                        cnv_group.region_file, cnv_group.access_file,
                        size_calc_fn, cnv_group.work_dir, data)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {
                    "target": target_bed,
                    "antitarget": anti_bed,
                    "group": str(group_index),
                }
            # Every sample is passed through, with or without bins attached.
            out.append([data])
    if len(out) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (names_out, names_in))
    return out
def _calculate_sv_bins_gatk(data, cnv_group, size_calc_fn):
    """Calculate structural variant bins using GATK4 CNV callers region or even intervals approach.

    When the sample has a "gatk-cnv" background reference the target BED is
    derived from it; otherwise intervals are prepared from the group's region
    file. ``size_calc_fn`` is accepted but not used here.

    Returns a ``(target_bed, None, gc_annotated_tsv)`` tuple; the middle slot
    is always None.
    """
    background = dd.get_background_cnv_reference(data, "gatk-cnv")
    if background:
        target_bed = gatkcnv.pon_to_bed(background, cnv_group.work_dir, data)
    else:
        target_bed = gatkcnv.prepare_intervals(data, cnv_group.region_file, cnv_group.work_dir)
    return target_bed, None, gatkcnv.annotate_intervals(target_bed, data)
def _calculate_sv_bins_cnvkit(data, cnv_group, size_calc_fn):
    """Calculate structural variant bins using target/anti-target approach from CNVkit.

    A "cnvkit" background reference on the sample takes precedence; without
    one, bins are computed from the group's region and access files using
    ``size_calc_fn``.

    Returns a ``(target_bed, anti_bed, None)`` tuple; the last slot is always
    None.
    """
    from bcbio.structural import cnvkit
    background = dd.get_background_cnv_reference(data, "cnvkit")
    if background:
        bins = cnvkit.targets_from_background(background, cnv_group.work_dir, data)
    else:
        bins = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file,
                                     size_calc_fn, cnv_group.work_dir, data)
    target_bed, anti_bed = bins
    return target_bed, anti_bed, None
def _normalize_sv_coverage_gatk(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage using panel of normals with GATK's de-noise approaches.

    Panel of normals selection, in order of precedence:
    - the single "gatk-cnv" background reference configured on the inputs
    - a panel created from ``backgrounds`` in ``work_dir``
    - None when neither is available

    ``out_files`` and ``back_files`` are updated in place, keyed by sample
    name, and returned as ``(back_files, out_files)``.
    """
    references = [dd.get_background_cnv_reference(d, "gatk-cnv") for d in inputs]
    input_backs = {ref for ref in references if ref is not None}
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        pon = next(iter(input_backs))
    elif backgrounds:
        pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir)
    else:
        pon = None
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # Each sample is de-noised into its own structural/<sample>/bins directory.
        sample_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
        out_files[sample_name] = gatkcnv.denoise(data, pon, sample_dir)
        back_files[sample_name] = pon
    return back_files, out_files
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions

    Normal sample names come from two places: the first column of every
    two-column line in the first "seq2c" background reference (when one is
    configured), plus every input sample whose affected status equals 1.

    Returns the items produced by splitting the cohort calls back out.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])
    references = [dd.get_background_cnv_reference(d, "seq2c") for d in items]
    input_backs = list({ref for ref in references if ref is not None})
    coverage_file = _combine_coverages(items, work_dir, input_backs)
    read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
    normal_names = []
    if input_backs:
        with open(input_backs[0]) as in_handle:
            for line in in_handle:
                fields = line.split()
                # Only two-column lines contribute a name.
                if len(fields) == 2:
                    normal_names.append(fields[0])
    normal_names.extend(dd.get_sample_name(x) for x in items
                        if population.get_affected_status(x) == 1)
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    return _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions

    Normal sample names come from two places: the first column of every
    two-column line in the first "seq2c" background reference (when one is
    configured), plus every input sample whose affected status equals 1.

    Returns the items produced by splitting the cohort calls back out.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])
    references = [dd.get_background_cnv_reference(d, "seq2c") for d in items]
    input_backs = list({ref for ref in references if ref is not None})
    coverage_file = _combine_coverages(items, work_dir, input_backs)
    read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
    normal_names = []
    if input_backs:
        with open(input_backs[0]) as in_handle:
            for line in in_handle:
                fields = line.split()
                # Only two-column lines contribute a name.
                if len(fields) == 2:
                    normal_names.append(fields[0])
    normal_names.extend(dd.get_sample_name(x) for x in items
                        if population.get_affected_status(x) == 1)
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    return _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    A single "cnvkit" background reference configured on the inputs is used
    directly; otherwise a background is built in ``work_dir`` from
    ``backgrounds`` (or from ``inputs`` when there are no backgrounds).

    ``out_files`` and ``back_files`` are updated in place, keyed by sample
    name, for every input that has target depth bins, and returned as
    ``(back_files, out_files)``.
    """
    from bcbio.structural import cnvkit
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    # Flat list: target then antitarget coverage for each background sample.
    cnns = []
    for background in backgrounds:
        cnns.append(tz.get_in(target_key, background))
        cnns.append(tz.get_in(antitarget_key, background))
    # The last input with target depth bins supplies the BEDs for the background.
    # NOTE(review): if no input has target depth bins these two names stay
    # unbound and the cnvkit_background call below fails -- confirm that
    # callers always pass at least one such input.
    for sample in inputs:
        if tz.get_in(target_key, sample):
            target_bed = tz.get_in(target_key, sample)
            antitarget_bed = tz.get_in(antitarget_key, sample)
    references = [dd.get_background_cnv_reference(sample, "cnvkit") for sample in inputs]
    input_backs = {ref for ref in references if ref is not None}
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = next(iter(input_backs))
    else:
        background_path = os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id))
        back_file = cnvkit.cnvkit_background(cnns, background_path, backgrounds or inputs,
                                             target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # The per-sample directory is created even when the sample has no bins.
        sample_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
        if not tz.get_in(target_key, data):
            continue
        fix_file = os.path.join(sample_dir, "%s-normalized.cnr" % (sample_name))
        fix_cmd_inputs.append((tz.get_in(target_key, data),
                               tz.get_in(antitarget_key, data),
                               back_file, fix_file, data))
        out_files[sample_name] = fix_file
        back_files[sample_name] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C

    The starting BED comes from the "seq2c" background reference when one is
    configured, otherwise from the SV regions, falling back to the variant
    regions. Inputs with fewer than 4 columns are annotated with gene names.

    Returns the path to the "-seq2cclean.bed" file, or None when no BED file
    is available.

    Raises ValueError when a BED with fewer than 4 columns cannot be
    annotated.
    """
    background = dd.get_background_cnv_reference(data, "seq2c")
    if background:
        bed_file = _background_to_bed(background, data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num >= 4:
        annotated_file = bed_file
    else:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        # Getting the same file back means no annotation was added.
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if utils.file_uptodate(ready_file, annotated_file):
        return ready_file

    def _has_gene_name(feature):
        return feature.name not in ("", ".", "-")

    bed = bt.BedTool(annotated_file)
    # Inputs wider than 4 columns are cut to the first 4, except 8-column ones.
    if col_num > 4 and col_num != 8:
        bed = bed.cut(range(4))
    bed = bed.filter(_has_gene_name)
    with file_transaction(data, ready_file) as tx_out_file:
        bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C

    The starting BED comes from the "seq2c" background reference when one is
    configured, otherwise from the SV regions, falling back to the variant
    regions. Inputs with fewer than 4 columns are annotated with gene names.

    Returns the path to the "-seq2cclean.bed" file, or None when no BED file
    is available.

    Raises ValueError when a BED with fewer than 4 columns cannot be
    annotated.
    """
    background = dd.get_background_cnv_reference(data, "seq2c")
    if background:
        bed_file = _background_to_bed(background, data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num >= 4:
        annotated_file = bed_file
    else:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        # Getting the same file back means no annotation was added.
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if utils.file_uptodate(ready_file, annotated_file):
        return ready_file

    def _has_gene_name(feature):
        return feature.name not in ("", ".", "-")

    bed = bt.BedTool(annotated_file)
    # Inputs wider than 4 columns are cut to the first 4, except 8-column ones.
    if col_num > 4 and col_num != 8:
        bed = bed.cut(range(4))
    bed = bed.filter(_has_gene_name)
    with file_transaction(data, ready_file) as tx_out_file:
        bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _normalize_sv_coverage_gatk(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage using panel of normals with GATK's de-noise approaches.

    Panel of normals selection, in order of precedence:
    - the single "gatk-cnv" background reference configured on the inputs
    - a panel created from ``backgrounds`` in ``work_dir``
    - None when neither is available

    ``out_files`` and ``back_files`` are updated in place, keyed by sample
    name, and returned as ``(back_files, out_files)``.
    """
    references = [dd.get_background_cnv_reference(d, "gatk-cnv") for d in inputs]
    input_backs = {ref for ref in references if ref is not None}
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        pon = next(iter(input_backs))
    elif backgrounds:
        pon = gatkcnv.create_panel_of_normals(backgrounds, group_id, work_dir)
    else:
        pon = None
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # Each sample is de-noised into its own structural/<sample>/bins directory.
        sample_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
        out_files[sample_name] = gatkcnv.denoise(data, pon, sample_dir)
        back_files[sample_name] = pon
    return back_files, out_files
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    A single "cnvkit" background reference configured on the inputs is used
    directly; otherwise a background is built in ``work_dir`` from
    ``backgrounds`` (or from ``inputs`` when there are no backgrounds).

    ``out_files`` and ``back_files`` are updated in place, keyed by sample
    name, for every input that has target depth bins, and returned as
    ``(back_files, out_files)``.
    """
    from bcbio.structural import cnvkit
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    # Flat list: target then antitarget coverage for each background sample.
    cnns = []
    for background in backgrounds:
        cnns.append(tz.get_in(target_key, background))
        cnns.append(tz.get_in(antitarget_key, background))
    # The last input with target depth bins supplies the BEDs for the background.
    # NOTE(review): if no input has target depth bins these two names stay
    # unbound and the cnvkit_background call below fails -- confirm that
    # callers always pass at least one such input.
    for sample in inputs:
        if tz.get_in(target_key, sample):
            target_bed = tz.get_in(target_key, sample)
            antitarget_bed = tz.get_in(antitarget_key, sample)
    references = [dd.get_background_cnv_reference(sample, "cnvkit") for sample in inputs]
    input_backs = {ref for ref in references if ref is not None}
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = next(iter(input_backs))
    else:
        background_path = os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id))
        back_file = cnvkit.cnvkit_background(cnns, background_path, backgrounds or inputs,
                                             target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        # The per-sample directory is created even when the sample has no bins.
        sample_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
        if not tz.get_in(target_key, data):
            continue
        fix_file = os.path.join(sample_dir, "%s-normalized.cnr" % (sample_name))
        fix_cmd_inputs.append((tz.get_in(target_key, data),
                               tz.get_in(antitarget_key, data),
                               back_file, fix_file, data))
        out_files[sample_name] = fix_file
        back_files[sample_name] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a point
    for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Returns the original ``items`` untouched when no sample uses the general
    SV bins. Otherwise returns a list of single-sample lists; samples that
    were normalized gain "background" and "normalized" entries under
    ``data["depth"]["bins"]``.
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items

    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]

    def _bin_group(sample):
        return tz.get_in(["regions", "bins", "group"], sample)

    out_files = {}
    back_files = {}
    # NOTE(review): groupby only merges adjacent samples; this relies on items
    # arriving already ordered by bin group -- confirm against the caller.
    for group_id, gitems in itertools.groupby(items, _bin_group):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        # Flat list: target then antitarget coverage for each background sample.
        cnns = []
        for background in backgrounds:
            cnns.append(tz.get_in(target_key, background))
            cnns.append(tz.get_in(antitarget_key, background))
        assert inputs, "Did not find inputs for sample batch: %s" % (
            " ".join(dd.get_sample_name(x) for x in items))
        # The last input with target depth bins supplies the BEDs; the names are
        # deliberately not reset between groups, matching the existing behavior.
        for sample in inputs:
            if tz.get_in(target_key, sample):
                target_bed = tz.get_in(target_key, sample)
                antitarget_bed = tz.get_in(antitarget_key, sample)
        # The group's background is written next to the first input's bins.
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        references = [dd.get_background_cnv_reference(sample) for sample in inputs]
        input_backs = {ref for ref in references if ref is not None}
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = next(iter(input_backs))
        else:
            background_path = os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id))
            back_file = cnvkit.cnvkit_background(cnns, background_path, backgrounds or inputs,
                                                 target_bed, antitarget_bed)
        for data in inputs:
            sample_name = dd.get_sample_name(data)
            # The per-sample directory is created even when the sample has no bins.
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "structural", sample_name, "bins"))
            if not tz.get_in(target_key, data):
                continue
            fix_file = cnvkit.run_fix(
                tz.get_in(target_key, data),
                tz.get_in(antitarget_key, data),
                back_file,
                os.path.join(work_dir, "%s-normalized.cnr" % (sample_name)),
                data)
            out_files[sample_name] = fix_file
            back_files[sample_name] = back_file

    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["background"] = back_files[sample_name]
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        out.append([data])
    return out