def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for size
    chunks of 5Gb or at most 100 maximum splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.

    Only a missing (None) align_split_size is estimated. Input sizes are summed
    from ``files_orig`` when save_diskspace is set, otherwise from ``files``,
    and ``data["config"]["algorithm"]["align_split_size"]`` is written only
    when the total is larger than the target size. An explicitly configured,
    truthy value is left untouched but is rejected in combination with UMI
    consensus.

    Returns the (possibly updated) data dictionary.
    """
    target_size = 5  # Gb
    target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = dd.get_align_split_size(data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                # Skip empty placeholders (os.path.exists raises TypeError on None)
                # as well as files that are not present locally.
                if fname and os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set if we have files and are bigger than the target size
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                    int(1e6 * _pick_align_split_size(total_size, target_size,
                                                     target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether gzipped fastq inputs can be used as-is.

    Inputs are ready when every non-empty entry ends in ``.gz`` and no
    quality-format conversion, splitting, remote retrieval, end trimming or
    downsampling is required.

    With ``require_bgzip`` set, each non-empty entry must additionally pass
    the ``_check_gzipped_input`` test (first element of its result falsy), so
    that bgzip is not forced when indexed files are not needed.
    """
    gzipped = all([not fname or fname.endswith(".gz") for fname in in_files])
    if gzipped and require_bgzip:
        gzipped = all([not fname or not _check_gzipped_input(fname, data)[0]
                       for fname in in_files])
    illumina_quality = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    # Anything other than an explicit False counts as wanting splitting
    splitting = dd.get_align_split_size(data) is not False
    if not gzipped or illumina_quality or splitting:
        return False
    if objectstore.is_remote(in_files[0]) or trim_ends:
        return False
    return not get_downsample_params(data)
def _variant_checkpoints(samples):
    """Summarize which analysis steps the sample configurations call for.

    Returns a dictionary of boolean flags (vc, sv, jointvc, hla, align,
    align_split); each is true when at least one sample requests that step.
    """
    return {
        "vc": any([dd.get_variantcaller(s) for s in samples]),
        "sv": any([dd.get_svcaller(s) for s in samples]),
        # Joint calling needs a joint caller or gvcf output, plus a batch
        "jointvc": any([(dd.get_jointcaller(s) or ("gvcf" in dd.get_tools_on(s))) and dd.get_batch(s)
                        for s in samples]),
        "hla": any([dd.get_hlacaller(s) for s in samples]),
        "align": any([(dd.get_aligner(s) or dd.get_bam_clean(s)) for s in samples]),
        # Split when some sample has an aligner and has not set the split size to False
        "align_split": any([dd.get_align_split_size(s) is not False and dd.get_aligner(s)
                            for s in samples]),
    }
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.

    When the inputs are ready-to-use gzipped fastq and either are not bgzipped
    or alignment splitting is disabled (align_split_size is False), a
    placeholder ``.gbi`` file is written next to each input instead of a real
    index. Otherwise ``_grabix_index`` is run over the inputs through
    ``run_multicore``.

    Empty entries in ``in_files`` are skipped in both cases.

    Returns ``data`` unchanged.
    """
    # if we have gzipped but not bgzipped, add a fake index for CWL support
    # Also skips bgzip indexing if we don't need alignment splitting
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True) or
                                              dd.get_align_split_size(data) is False):
        for in_file in in_files:
            # Skip empty placeholders, matching the real indexing branch below;
            # concatenating ".gbi" onto None would raise TypeError.
            if not in_file:
                continue
            with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                with open(tx_gbi_file, "w") as out_handle:
                    out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        # Each job gets its own copy of the configuration
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.

    When the inputs are ready-to-use gzipped fastq and either are not bgzipped
    or alignment splitting is disabled (align_split_size is False), a
    placeholder ``.gbi`` file is written next to each input that does not
    already have one. Otherwise ``_grabix_index`` is run over the inputs
    through ``run_multicore``.

    Empty entries in ``in_files`` are skipped in both cases.

    Returns ``data`` unchanged.
    """
    # if we have gzipped but not bgzipped, add a fake index for CWL support
    # Also skips bgzip indexing if we don't need alignment splitting
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True) or
                                              dd.get_align_split_size(data) is False):
        for in_file in in_files:
            # Skip empty placeholders, matching the real indexing branch below;
            # concatenating ".gbi" onto None would raise TypeError.
            if not in_file:
                continue
            # Leave an existing index (real or placeholder) in place
            if not utils.file_exists(in_file + ".gbi"):
                with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                    with open(tx_gbi_file, "w") as out_handle:
                        out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        # Each job gets its own copy of the configuration
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
def _variant_checkpoints(samples):
    """Work out which analysis steps are needed from the sample configuration.

    Returns a dictionary of boolean flags (vc, sv, jointvc, hla, align,
    align_split, umi, ensemble, cancer); each is true when at least one
    sample asks for that step.
    """
    def _any_sample(predicate):
        # Evaluate the predicate on every sample before reducing
        return any([predicate(sample) for sample in samples])

    checkpoints = {}
    # An existing variant file counts the same as a configured caller
    checkpoints["vc"] = _any_sample(lambda s: dd.get_variantcaller(s) or s.get("vrn_file"))
    checkpoints["sv"] = _any_sample(lambda s: dd.get_svcaller(s))
    checkpoints["jointvc"] = _any_sample(lambda s: dd.get_jointcaller(s) or "gvcf" in dd.get_tools_on(s))
    checkpoints["hla"] = _any_sample(lambda s: dd.get_hlacaller(s))
    checkpoints["align"] = _any_sample(lambda s: dd.get_aligner(s) or dd.get_bam_clean(s))
    # Split when some sample has an aligner and has not set the split size to False
    checkpoints["align_split"] = _any_sample(
        lambda s: dd.get_align_split_size(s) is not False and dd.get_aligner(s))
    checkpoints["umi"] = _any_sample(lambda s: dd.get_umi_consensus(s))
    checkpoints["ensemble"] = _any_sample(lambda s: dd.get_ensemble(s))
    checkpoints["cancer"] = any(dd.get_phenotype(s) in ("tumor",) for s in samples)
    return checkpoints
def _variant_checkpoints(samples):
    """Derive the set of required analysis steps from the sample configuration.

    Returns a dictionary of boolean flags (vc, sv, jointvc, hla, align,
    align_split, archive, umi, ensemble, cancer); each is true when at least
    one sample asks for that step.
    """
    # (flag name, per-sample test) pairs, in the order the flags are stored
    per_sample_checks = [
        # An existing variant file counts the same as a configured caller
        ("vc", lambda s: dd.get_variantcaller(s) or s.get("vrn_file")),
        ("sv", lambda s: dd.get_svcaller(s)),
        ("jointvc", lambda s: dd.get_jointcaller(s) or "gvcf" in dd.get_tools_on(s)),
        ("hla", lambda s: dd.get_hlacaller(s)),
        ("align", lambda s: dd.get_aligner(s) or dd.get_bam_clean(s)),
        # Split when a sample has an aligner and has not set the split size to False
        ("align_split", lambda s: dd.get_align_split_size(s) is not False and dd.get_aligner(s)),
        ("archive", lambda s: dd.get_archive(s)),
        ("umi", lambda s: dd.get_umi_consensus(s)),
        ("ensemble", lambda s: dd.get_ensemble(s)),
    ]
    checkpoints = {}
    for flag, check in per_sample_checks:
        checkpoints[flag] = any([check(sample) for sample in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(sample) in ("tumor",) for sample in samples)
    return checkpoints
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for size
    chunks of 5Gb or at most 100 maximum splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.

    For CWL runs, we pick larger split sizes (20Gb, 80 million reads) to avoid
    overhead of staging each chunk.

    Only a missing (None) align_split_size is estimated. Input sizes are summed
    from ``files_orig`` when save_diskspace is set, otherwise from ``files``,
    and ``data["config"]["algorithm"]["align_split_size"]`` is written only
    when the total is larger than the target size. An explicitly configured,
    truthy value is left untouched but is rejected in combination with UMI
    consensus.

    Returns the (possibly updated) data dictionary.
    """
    if cwlutils.is_cwl_run(data):
        target_size = 20  # Gb
        target_size_reads = 80  # million reads
    else:
        target_size = 5  # Gb
        target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = dd.get_align_split_size(data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                # Skip empty placeholders (os.path.exists raises TypeError on None)
                # as well as files that are not present locally.
                if fname and os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set if we have files and are bigger than the target size
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                    int(1e6 * _pick_align_split_size(total_size, target_size,
                                                     target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data