Esempio n. 1
0
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, skipping if snpEff database not present.
    """
    snpeff_db, datadir = get_db(data)
    if not snpeff_db:
        return None, None

    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    stats_file = "%s-stats.html" % utils.splitext_plus(out_file)[0]
    csv_file = "%s-stats.csv" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(data, out_file) as tx_out_file:
            snpeff_cmd = _get_snpeff_cmd("eff", datadir, data, tx_out_file)
            cmd = (
                "{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} "
                "-csvStats {csv_file} -s {stats_file} {snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file, [stats_file, csv_file]
Esempio n. 2
0
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), "Did not find %s snpEff genome data in %s" % (
        snpeff_db,
        datadir,
    )
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Esempio n. 3
0
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
Esempio n. 4
0
def bgzip_and_index(in_file, config, remove_orig=True, prep_cmd=""):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file) or not os.path.lexists(out_file):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            if prep_cmd:
                cmd = "cat {in_file} | {prep_cmd} | {bgzip} -c > {tx_out_file}"
            else:
                cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()),
                       "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        if remove_orig:
            try:
                os.remove(in_file)
            except OSError:  # Handle cases where run in parallel and file has been deleted
                pass
    tabix_index(out_file, config)
    return out_file
Esempio n. 5
0
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, skipping if snpEff database not present.
    """
    snpeff_db, datadir = get_db(data)
    if not snpeff_db:
        return None

    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Esempio n. 6
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip,
                needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(
        work_dir,
        os.path.basename(in_file) +
        (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run(
                "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                    **locals()), "bgzip input file")
    return out_file
Esempio n. 7
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                do.run("cat {in_file} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
Esempio n. 8
0
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
Esempio n. 9
0
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
Esempio n. 10
0
def bgzip_and_index(in_file, config, remove_orig=True, prep_cmd=""):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file) or not os.path.lexists(out_file):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            if prep_cmd:
                cmd = "cat {in_file} | {prep_cmd} | {bgzip} -c > {tx_out_file}"
            else:
                cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        if remove_orig:
            try:
                os.remove(in_file)
            except OSError:  # Handle cases where run in parallel and file has been deleted
                pass
    tabix_index(out_file, config)
    return out_file
Esempio n. 11
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
Esempio n. 12
0
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
Esempio n. 13
0
def bgzip_and_index(in_file, config):
    """bgzip and tabix index an input VCF file.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
        os.remove(in_file)
    tabix_index(out_file, config)
    return out_file
Esempio n. 14
0
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip,
                needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(
        work_dir,
        os.path.basename(in_file).replace(".bz2", "") +
        (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file,
                                           unpack=needs_gunzip or needs_convert
                                           or needs_bgzip
                                           or dd.get_trim_ends(data))
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run(
                    "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".
                    format(**locals()), "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if (needs_convert
                                         or dd.get_trim_ends(data)) else ""
                do.run(
                    "cat {in_file} {bgzip} > {tx_out_file}".format(**locals()),
                    "Get remote input")
            else:
                raise ValueError(
                    "Unexpected inputs: %s %s %s %s" %
                    (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
Esempio n. 15
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError, msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
Esempio n. 16
0
def bgzip_and_index(in_file, config):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        try:
            os.remove(in_file)
        except OSError:  # Handle cases where run in parallel and file has been deleted
            pass
    tabix_index(out_file, config)
    return out_file
Esempio n. 17
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
Esempio n. 18
0
def bgzip_and_index(in_file, config):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()),
                       "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        try:
            os.remove(in_file)
        except OSError:  # Handle cases where run in parallel and file has been deleted
            pass
    tabix_index(out_file, config)
    return out_file
Esempio n. 19
0
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(work_dir, os.path.basename(in_file).replace(".bz2", "") +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or
                                           needs_bgzip or dd.get_trim_ends(data))
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
Esempio n. 20
0
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file