def vc_output_record(samples): """Prepare output record from variant calling to feed into downstream analysis. Prep work handles reformatting so we return generated dictionaries. For any shared keys that are calculated only once for a batch, like variant calls for the batch, we assign to every sample. """ shared_keys = [["vrn_file"], ["validate", "summary"], ["validate", "tp"], ["validate", "fp"], ["validate", "fn"]] raw = cwlutils.samples_to_records( [utils.to_single_data(x) for x in samples]) shared = {} for key in shared_keys: cur = list(set([x for x in [tz.get_in(key, d) for d in raw] if x])) if len(cur) > 0: assert len(cur) == 1, (key, cur) shared[tuple(key)] = cur[0] else: shared[tuple(key)] = None out = [] for d in raw: for key, val in shared.items(): d = tz.update_in(d, key, lambda x: val) out.append([d]) return out
def batch_for_variantcall(samples): """Prepare a set of samples for parallel variant calling. CWL input target that groups samples into batches and variant callers for parallel processing. If doing joint calling, with `tools_on: [gvcf]`, split the sample into individuals instead of combining into a batch. """ to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False) batch_groups = collections.defaultdict(list) to_process = [utils.to_single_data(x) for x in to_process] for data in cwlutils.samples_to_records(to_process): vc = get_variantcaller(data, require_bam=False) batches = dd.get_batches(data) or dd.get_sample_name(data) if not isinstance(batches, (list, tuple)): batches = [batches] for b in batches: batch_groups[(b, vc)].append(utils.deepish_copy(data)) batches = [] for cur_group in batch_groups.values(): joint_calling = any([is_joint(d) for d in cur_group]) if joint_calling: for d in cur_group: batches.append([d]) else: batches.append(cur_group) return batches + extras
def vc_output_record(samples): """Prepare output record from variant calling to feed into downstream analysis. Prep work handles reformatting so we return generated dictionaries. For any shared keys that are calculated only once for a batch, like variant calls for the batch, we assign to every sample. """ shared_keys = [["vrn_file"], ["validate", "summary"], ["validate", "tp"], ["validate", "fp"], ["validate", "fn"]] raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples]) shared = {} for key in shared_keys: cur = list(set([x for x in [tz.get_in(key, d) for d in raw] if x])) if len(cur) > 0: assert len(cur) == 1, (key, cur) shared[tuple(key)] = cur[0] else: shared[tuple(key)] = None out = [] for d in raw: for key, val in shared.items(): d = tz.update_in(d, key, lambda x: val) out.append([d]) return out
def qc_to_rec(samples): """CWL: Convert a set of input samples into records for parallelization. """ samples = [utils.to_single_data(x) for x in samples] samples = cwlutils.assign_complex_to_samples(samples) to_analyze, extras = _split_samples_by_qc(samples) recs = cwlutils.samples_to_records([utils.to_single_data(x) for x in to_analyze + extras]) return [[x] for x in recs]
def batch_for_sv(samples): """Prepare a set of samples for parallel structural variant calling. CWL input target -- groups samples into batches and structural variant callers for parallel processing. """ to_process, extras, background = _batch_split_by_sv(samples, "standard") out = [cwlutils.samples_to_records(xs) for xs in to_process.values()] + extras return out
def create_inputs(data): """Index input reads and prepare groups of reads to process concurrently. Allows parallelization of alignment beyond processors available on a single machine. Prepares a rtg SDF format file with build in indexes for retrieving sections of files. Retains back compatibility with bgzip/grabix approach. """ data = cwlutils.normalize_missing(data) aligner = tz.get_in(("config", "algorithm", "aligner"), data) # CRAM files must be converted to bgzipped fastq, unless not aligning. # Also need to prep and download remote files. if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or objectstore.is_remote(data["files"][0]))): # skip indexing on samples without input files or not doing alignment if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner): return [[data]] approach = "grabix" if _has_grabix_indices( data) else dd.get_align_prep_method(data) data["files_orig"] = data["files"] if approach == "rtg": data["files"] = [rtg.to_sdf(data["files"], data)] else: data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data) # preparation converts illumina into sanger format data["config"]["algorithm"]["quality_format"] = "standard" data = _set_align_split_size(data) out = [] if tz.get_in(["config", "algorithm", "align_split_size"], data): if approach == "rtg": splits = rtg.calculate_splits( data["files"][0], data["config"]["algorithm"]["align_split_size"]) else: splits = _find_read_splits( data["files"][0], data["config"]["algorithm"]["align_split_size"]) for split in splits: cur_data = copy.deepcopy(data) cur_data["align_split"] = split out.append([cur_data]) else: out.append([data]) if "output_cwl_keys" in data: out = cwlutils.samples_to_records( [utils.to_single_data(x) for x in out], ["files", "align_split", "config__algorithm__quality_format"]) return out
def batch_for_variantcall(samples): """Prepare a set of samples for parallel variant calling. CWL input target that groups samples into batches and variant callers for parallel processing. """ to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False) batch_groups = collections.defaultdict(list) to_process = [utils.to_single_data(x) for x in to_process] for data in cwlutils.samples_to_records(to_process): vc = get_variantcaller(data, require_bam=False) batches = dd.get_batches(data) or dd.get_sample_name(data) if not isinstance(batches, (list, tuple)): batches = [batches] for b in batches: batch_groups[(b, vc)].append(utils.deepish_copy(data)) return list(batch_groups.values()) + extras
def create_inputs(data): """Index input reads and prepare groups of reads to process concurrently. Allows parallelization of alignment beyond processors available on a single machine. Prepares a bgzip and grabix indexed file for retrieving sections of files. """ from bcbio.pipeline import sample data = cwlutils.normalize_missing(data) aligner = tz.get_in(("config", "algorithm", "aligner"), data) # CRAM files must be converted to bgzipped fastq, unless not aligning. # Also need to prep and download remote files. if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or objectstore.is_remote(data["files"][0]))): # skip indexing on samples without input files or not doing alignment if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner): return [[data]] # if this is a DRAGEN BAM, we need to do further alignments with this BAM, so don't convert it if dd.get_umi_type(data) == "dragen": return [[data]] data["files_orig"] = data["files"] data["files"] = prep_fastq_inputs(data["files"], data) # preparation converts illumina into sanger format data["config"]["algorithm"]["quality_format"] = "standard" # Handle any necessary trimming data = utils.to_single_data(sample.trim_sample(data)[0]) _prep_grabix_indexes(data["files"], data) data = _set_align_split_size(data) out = [] if tz.get_in(["config", "algorithm", "align_split_size"], data): splits = _find_read_splits( data["files"][0], int(data["config"]["algorithm"]["align_split_size"])) for split in splits: cur_data = copy.deepcopy(data) cur_data["align_split"] = split out.append([cur_data]) else: out.append([data]) if "output_cwl_keys" in data: out = cwlutils.samples_to_records( [utils.to_single_data(x) for x in out], ["files", "align_split", "config__algorithm__quality_format"]) return out
def create_inputs(data): """Index input reads and prepare groups of reads to process concurrently. Allows parallelization of alignment beyond processors available on a single machine. Prepares a bgzip and grabix indexed file for retrieving sections of files. """ from bcbio.pipeline import sample data = cwlutils.normalize_missing(data) aligner = tz.get_in(("config", "algorithm", "aligner"), data) # CRAM files must be converted to bgzipped fastq, unless not aligning. # Also need to prep and download remote files. if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or objectstore.is_remote(data["files"][0]))): # skip indexing on samples without input files or not doing alignment if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner): return [[data]] data["files_orig"] = data["files"] data["files"] = _prep_fastq_inputs(data["files"], data) # preparation converts illumina into sanger format data["config"]["algorithm"]["quality_format"] = "standard" # Handle any necessary trimming data = utils.to_single_data(sample.trim_sample(data)[0]) _prep_grabix_indexes(data["files"], data) data = _set_align_split_size(data) out = [] if tz.get_in(["config", "algorithm", "align_split_size"], data): splits = _find_read_splits(data["files"][0], data["config"]["algorithm"]["align_split_size"]) for split in splits: cur_data = copy.deepcopy(data) cur_data["align_split"] = split out.append([cur_data]) else: out.append([data]) if "output_cwl_keys" in data: out = cwlutils.samples_to_records([utils.to_single_data(x) for x in out], ["files", "align_split", "config__algorithm__quality_format"]) return out
def qc_to_rec(samples): """CWL: Convert a set of input samples into records for parallelization. """ samples = [utils.to_single_data(x) for x in samples] return [[x] for x in cwlutils.samples_to_records(samples)]