def create_inputs(data):
    """Index the input reads and build the groups of reads to align concurrently.

    Splitting the reads lets alignment be parallelized beyond the processors
    available on a single machine. Inputs are prepared as bgzip and grabix
    indexed files so that sections of them can be retrieved.

    Returns a list of single-item lists: one per alignment split, or a single
    entry wrapping ``data`` when no split size is configured or the sample is
    skipped. When ``data`` contains "output_cwl_keys", the result of
    ``cwlutils.samples_to_records`` is returned instead.
    """
    # Function-scope import -- presumably avoids a circular import; confirm.
    from bcbio.pipeline import sample

    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    have_files = "files" in data and data["files"]
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Remote files also need to be prepped and downloaded.
    must_convert = (have_files and aligner and
                    (_is_cram_input(data["files"]) or
                     objectstore.is_remote(data["files"][0])))
    # Skip indexing on samples without input files or not doing alignment.
    if not must_convert and (not have_files or data["files"][0] is None or not aligner):
        return [[data]]

    # Keep the original inputs around before replacing them with prepared ones.
    data["files_orig"] = data["files"]
    data["files"] = _prep_fastq_inputs(data["files"], data)
    # Preparation converts illumina into sanger format.
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming.
    trimmed = sample.trim_sample(data)
    data = utils.to_single_data(trimmed[0])
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)

    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if split_size:
        out = []
        for read_split in _find_read_splits(data["files"][0], split_size):
            # Each split gets its own independent copy of the sample.
            split_data = copy.deepcopy(data)
            split_data["align_split"] = read_split
            out.append([split_data])
    else:
        out = [[data]]

    if "output_cwl_keys" in data:
        record_keys = ["files", "align_split", "config__algorithm__quality_format"]
        singles = [utils.to_single_data(item) for item in out]
        out = cwlutils.samples_to_records(singles, record_keys)
    return out
def trim_sample(*args):
    """Forward all positional arguments to ``sample.trim_sample`` and return its result.

    NOTE(review): ``sample`` has to be resolvable at module scope when this
    runs; its import is not visible in this section of the file -- confirm.
    """
    return sample.trim_sample(*args)