def _bgzip_from_fastq(data):
    """Produce the align_prep fastq for a sample, bgzipping the input when needed.

    Works out from the input's location, extension, size and the requested
    quality conversion/trimming whether the file has to be decompressed
    and/or bgzipped. Inputs that need no work are linked into the align_prep
    directory with utils.symlink_plus; everything else is handed to
    _bgzip_file. Returns the path of the prepared file.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    if remote:
        if in_file.endswith(".bz2"):
            do_bgzip, do_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input without a split size configured: no bgzip/gunzip step
            do_bgzip, do_gunzip = False, False
        else:
            do_bgzip, do_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        do_bgzip, do_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            do_bgzip, do_gunzip = True, True
        else:
            # helper result is unpacked as (needs_bgzip, needs_gunzip)
            do_bgzip, do_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        do_bgzip, do_gunzip = True, True
    else:
        do_bgzip, do_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    if (do_bgzip or do_gunzip or needs_convert
            or dd.get_trim_ends(data) or remote):
        return _bgzip_file(in_file, data["config"], work_dir,
                           do_bgzip, do_gunzip, needs_convert, data)
    # Nothing to do to the file itself: expose it under a sample-prefixed name.
    linked_name = "%s_%s" % (dd.get_sample_name(data),
                             os.path.basename(in_file))
    out_file = os.path.join(work_dir, linked_name)
    utils.symlink_plus(in_file, out_file)
    return out_file
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip,
                needs_convert, data):
    """Write a bgzipped copy of in_file into work_dir and return its path.

    The output keeps the input's base name with any ".bz2" removed and a
    ".gz" suffix appended when the input does not already end in ".gz".
    An existing output file is reused untouched. Otherwise a shell pipeline
    is run inside a file transaction: optionally decompressing (gunzip or
    bunzip2), optionally routing through fastq_convert_pipe_cl for quality
    conversion/trimming, then bgzip; remote inputs that need no bgzip are
    fetched with cat.

    Raises ValueError when the input is neither flagged for bgzip nor remote.
    """
    base_name = os.path.basename(in_file).replace(".bz2", "")
    if not in_file.endswith(".gz"):
        base_name += ".gz"
    out_file = os.path.join(work_dir, base_name)
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(config, out_file) as tx_out_file:
        bgzip_cmd = tools.get_bgzip_cmd(config)
        # Remember remoteness before in_file is rewritten for the command line.
        was_remote = objectstore.is_remote(in_file)
        in_file = objectstore.cl_input(
            in_file,
            unpack=(needs_gunzip or needs_convert or needs_bgzip
                    or dd.get_trim_ends(data)))
        reformat = needs_convert or dd.get_trim_ends(data)
        if reformat:
            in_file = fastq_convert_pipe_cl(in_file, data)

        if needs_gunzip and not reformat:
            if in_file.endswith(".bz2"):
                decompress_tmpl = "bunzip2 -c {in_file} |"
            else:
                decompress_tmpl = "gunzip -c {in_file} |"
            decompress = decompress_tmpl.format(in_file=in_file)
            bgzip_source = "/dev/stdin"
        else:
            decompress = ""
            bgzip_source = in_file

        if needs_bgzip:
            cmd = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=decompress, bgzip=bgzip_cmd,
                bgzip_in=bgzip_source, tx_out_file=tx_out_file)
            do.run(cmd, "bgzip input file")
        elif was_remote:
            # Only compress the fetched stream when it went through conversion.
            compress_pipe = "| bgzip -c" if reformat else ""
            cmd = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=compress_pipe, tx_out_file=tx_out_file)
            do.run(cmd, "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" %
                             (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
def _bgzip_from_fastq(data):
    """Produce the align_prep fastq for a sample, bgzipping the input when needed.

    Works out from the input's location, extension, size and the requested
    quality conversion/trimming whether the file has to be decompressed
    and/or bgzipped, delegating that work to _bgzip_file. Inputs that need
    no work are symlinked into the align_prep directory, except for CWL runs
    where the input is either returned as-is (when a ".gbi" index sits next
    to it) or copied. Returns the path of the file to use.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    if remote:
        if in_file.endswith(".bz2"):
            do_bgzip, do_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input without a split size configured: no bgzip/gunzip step
            do_bgzip, do_gunzip = False, False
        else:
            do_bgzip, do_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        do_bgzip, do_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            do_bgzip, do_gunzip = True, True
        else:
            # helper result is unpacked as (needs_bgzip, needs_gunzip)
            do_bgzip, do_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        do_bgzip, do_gunzip = True, True
    else:
        do_bgzip, do_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    if (do_bgzip or do_gunzip or needs_convert
            or dd.get_trim_ends(data) or remote):
        return _bgzip_file(in_file, data["config"], work_dir,
                           do_bgzip, do_gunzip, needs_convert, data)

    linked_name = "%s_%s" % (dd.get_sample_name(data),
                             os.path.basename(in_file))
    out_file = os.path.join(work_dir, linked_name)
    if not data.get("is_cwl"):
        utils.symlink_plus(in_file, out_file)
        return out_file
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(in_file + ".gbi"):
        # Has grabix indexes, we're okay to go
        return in_file
    return utils.copy_plus(in_file, out_file)
def _bgzip_from_fastq(data):
    """Produce the align_prep fastq for a sample, bgzipping the input when needed.

    data["in_file"] may be a single path or a list/tuple of paths; the first
    path drives the decision about decompression and bgzipping. When any
    work is required, or when more than one input file is supplied, the
    original data["in_file"] value is handed to _bgzip_file. Otherwise the
    single input is passed to _symlink_or_copy_grabix together with a
    sample-prefixed target path in the align_prep directory.
    Returns the path of the file to use.
    """
    supplied = data["in_file"]
    in_file = supplied[0] if isinstance(supplied, (list, tuple)) else supplied
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    if remote:
        if in_file.endswith(".bz2"):
            do_bgzip, do_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input without a split size configured: no bgzip/gunzip step
            do_bgzip, do_gunzip = False, False
        else:
            do_bgzip, do_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        do_bgzip, do_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            do_bgzip, do_gunzip = True, True
        else:
            # helper result is unpacked as (needs_bgzip, needs_gunzip)
            do_bgzip, do_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        do_bgzip, do_gunzip = True, True
    else:
        do_bgzip, do_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    has_multiple_inputs = (isinstance(supplied, (tuple, list))
                           and len(supplied) > 1)
    if (do_bgzip or do_gunzip or needs_convert or dd.get_trim_ends(data)
            or remote or has_multiple_inputs):
        return _bgzip_file(supplied, data["config"], work_dir,
                           do_bgzip, do_gunzip, needs_convert, data)
    linked_name = "%s_%s" % (dd.get_sample_name(data),
                             os.path.basename(in_file))
    target = os.path.join(work_dir, linked_name)
    return _symlink_or_copy_grabix(in_file, target, data)
def _ready_gzip_fastq(in_files, data):
    """Report whether gzipped fastq inputs can be used without further prep.

    True only when every non-empty entry of in_files ends in ".gz", the
    quality format is not "illumina", align_split_size is explicitly False,
    the first input is not remote and no end trimming is requested.
    """
    gz_flags = [not fname or fname.endswith(".gz") for fname in in_files]
    is_illumina = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if not all(gz_flags):
        return False
    # Anything other than an explicit False split size counts as splitting.
    if is_illumina or split_size is not False:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not trim_ends
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether gzipped fastq inputs can be used without further prep.

    True only when every non-empty entry of in_files ends in ".gz", the
    quality format is not "illumina", the align split size is explicitly
    False, the first input is not remote, no end trimming is requested and
    no downsampling parameters are set. With require_bgzip, each gzipped
    input must additionally have a falsy first element in the result of
    _check_gzipped_input; leaving it off avoids forcing bgzip when indexed
    files are not needed.
    """
    gz_flags = [not fname or fname.endswith(".gz") for fname in in_files]
    inputs_ok = all(gz_flags)
    if require_bgzip and inputs_ok:
        bgzip_flags = [not fname or not _check_gzipped_input(fname, data)[0]
                       for fname in in_files]
        inputs_ok = all(bgzip_flags)
    is_illumina = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    split_size = dd.get_align_split_size(data)
    if not inputs_ok:
        return False
    # Anything other than an explicit False split size counts as splitting.
    if is_illumina or split_size is not False:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    if trim_ends:
        return False
    return not get_downsample_params(data)
def _ready_bgzip_fastq(in_files, data):
    """Report whether bgzipped fastq inputs can be used without further prep.

    True only when every non-empty entry of in_files ends in ".gz" and has a
    falsy first element in the result of _check_gzipped_input, the quality
    format is not "illumina", align_split_size is explicitly False, the
    first input is not remote and no end trimming is requested.
    """
    gz_flags = [not fname or fname.endswith(".gz") for fname in in_files]
    inputs_bgzipped = False
    if all(gz_flags):
        # Only inspect file contents once the extensions all look gzipped.
        bgzip_flags = [not fname or not _check_gzipped_input(fname, data)[0]
                       for fname in in_files]
        inputs_bgzipped = all(bgzip_flags)
    is_illumina = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if not inputs_bgzipped:
        return False
    # Anything other than an explicit False split size counts as splitting.
    if is_illumina or split_size is not False:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not trim_ends
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Build the seqtk command line that prepares a fastq input.

    Adds a "seqtk seq -Q64 -V" step when the quality format is "illumina"
    and a "seqtk trimfq" step when non-zero end trimming is configured for
    this read, piping the first into the second when both apply. The read
    number comes from data["read_num"] when present, else from read_num;
    read 0 uses trim_ends[0:2] and any other read uses trim_ends[2:4].
    Input is in_file (passed through objectstore.cl_input) or /dev/stdin.

    Returns the command string, which is empty when no step is needed.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    source = objectstore.cl_input(in_file) if in_file else "/dev/stdin"

    steps = []
    if needs_convert:
        steps.append("{seqtk} seq -Q64 -V {in_file}".format(
            seqtk=seqtk, in_file=source))
    if trim_ends:
        offset = 0 if data.get("read_num", read_num) == 0 else 2
        left_trim, right_trim = trim_ends[offset:offset + 2]
        if left_trim or right_trim:
            # After conversion the trimmer reads the converter's piped output.
            if needs_convert:
                pipe, trim_source = " | ", "/dev/stdin"
            else:
                pipe, trim_source = "", source
            steps.append(
                "{pipe}{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(
                    pipe=pipe, seqtk=seqtk, left_trim=left_trim,
                    right_trim=right_trim, trim_infile=trim_source))
    return "".join(steps)