def _run_snpeff(snp_in, out_format, data): """Run effects prediction with snpEff, skipping if snpEff database not present. """ snpeff_db, datadir = get_db(data) if not snpeff_db: return None, None assert os.path.exists(os.path.join(datadir, snpeff_db)), \ "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir) ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv" out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext) stats_file = "%s-stats.html" % utils.splitext_plus(out_file)[0] csv_file = "%s-stats.csv" % utils.splitext_plus(out_file)[0] if not utils.file_exists(out_file): config_args = " ".join(_snpeff_args_from_config(data)) if ext.endswith(".gz"): bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"]) else: bgzip_cmd = "" with file_transaction(data, out_file) as tx_out_file: snpeff_cmd = _get_snpeff_cmd("eff", datadir, data, tx_out_file) cmd = ( "{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} " "-csvStats {csv_file} -s {stats_file} {snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}" ) do.run(cmd.format(**locals()), "snpEff effects", data) if ext.endswith(".gz"): out_file = vcfutils.bgzip_and_index(out_file, data["config"]) return out_file, [stats_file, csv_file]
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, requiring a configured snpEff database.
    """
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
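# Hedged usage sketch for the extended bgzip_and_index signature above: compress
# and index a freshly written VCF without removing the original, placing the
# result in a separate directory. Both file paths are illustrative names only.
prepped_vcf = bgzip_and_index("batch1-variants.vcf", config=data["config"],
                              remove_orig=False, out_dir="work/prepped")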
def bgzip_and_index(in_file, config, remove_orig=True, prep_cmd=""): """bgzip and tabix index an input file, handling VCF and BED. """ out_file = in_file if in_file.endswith(".gz") else in_file + ".gz" if not utils.file_exists(out_file) or not os.path.lexists(out_file): assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file with file_transaction(config, out_file) as tx_out_file: bgzip = tools.get_bgzip_cmd(config) if prep_cmd: cmd = "cat {in_file} | {prep_cmd} | {bgzip} -c > {tx_out_file}" else: cmd = "{bgzip} -c {in_file} > {tx_out_file}" try: do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file)) except subprocess.CalledProcessError: # Race conditions: ignore errors where file has been deleted by another if os.path.exists(in_file) and not os.path.exists(out_file): raise if remove_orig: try: os.remove(in_file) except OSError: # Handle cases where run in parallel and file has been deleted pass tabix_index(out_file, config) return out_file
def _run_snpeff(snp_in, out_format, data): """Run effects prediction with snpEff, skipping if snpEff database not present. """ snpeff_db, datadir = get_db(data) if not snpeff_db: return None assert os.path.exists(os.path.join(datadir, snpeff_db)), \ "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir) snpeff_cmd = get_cmd("eff", datadir, data["config"]) ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv" out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext) if not utils.file_exists(out_file): config_args = " ".join(_snpeff_args_from_config(data)) if ext.endswith(".gz"): bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"]) else: bgzip_cmd = "" with file_transaction(data, out_file) as tx_out_file: cmd = ("{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} " "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "snpEff effects", data) if ext.endswith(".gz"): out_file = vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                do.run("cat {in_file} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
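# A minimal, self-contained sketch of the shell pipeline the _bgzip_file variants
# assemble, shown here without bcbio's file_transaction machinery: stream a gzipped
# FASTQ through bgzip so the output is block-gzipped and indexable. The file names
# are illustrative, and "bgzip" is assumed to be on PATH; the functions above
# instead resolve the binary from the run configuration via tools.get_bgzip_cmd.
import subprocess

in_fq = "sample_1.fastq.gz"   # illustrative gzipped input
out_fq = "sample_1.fq.gz"     # illustrative bgzipped output
subprocess.check_call("gunzip -c {inp} | bgzip -c /dev/stdin > {out}".format(inp=in_fq, out=out_fq),
                      shell=True)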
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0],
                                                          output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0],
                                                          output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
def bgzip_and_index(in_file, config): """bgzip and tabix index an input VCF file. """ out_file = in_file if in_file.endswith(".gz") else in_file + ".gz" if not utils.file_exists(out_file): with file_transaction(out_file) as tx_out_file: bgzip = tools.get_bgzip_cmd(config) cmd = "{bgzip} -c {in_file} > {tx_out_file}" do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file)) os.remove(in_file) tabix_index(out_file, config) return out_file
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(work_dir, os.path.basename(in_file).replace(".bz2", "") +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or
                                           needs_bgzip or dd.get_trim_ends(data))
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core." % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
def bgzip_and_index(in_file, config): """bgzip and tabix index an input file, handling VCF and BED. """ out_file = in_file if in_file.endswith(".gz") else in_file + ".gz" if not utils.file_exists(out_file): with file_transaction(out_file) as tx_out_file: bgzip = tools.get_bgzip_cmd(config) cmd = "{bgzip} -c {in_file} > {tx_out_file}" try: do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file)) except subprocess.CalledProcessError: # Race conditions: ignore errors where file has been deleted by another if os.path.exists(in_file) and not os.path.exists(out_file): raise try: os.remove(in_file) except OSError: # Handle cases where run in parallel and file has been deleted pass tabix_index(out_file, config) return out_file
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(work_dir, os.path.basename(in_file).replace(".bz2", "") +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or
                                           needs_bgzip or dd.get_trim_ends(data))
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, requiring a configured snpEff database.
    """
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file