Example #1
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if (vardict_cl and version and
        ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
         (vardict_cl == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))):
        opts += ["--nosv"]
    if (vardict_cl and version and
         (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
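
The --nosv and --deldupvar flags above are gated on the detected VarDict version. A minimal, runnable sketch of that version-gating pattern, using the same LooseVersion comparison (the version cutoffs are taken from the code above):

from distutils.version import LooseVersion

def supports_nosv(tool, version):
    # Mirrors the --nosv gate: vardict-java >= 1.5.5 or vardict >= 2018.07.25
    if not (tool and version):
        return False
    return ((tool == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
            (tool == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))

assert supports_nosv("vardict-java", "1.5.6")
assert not supports_nosv("vardict-java", "1.5.4")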
Example #2
def cpu_and_memory(programs, items):
    """Retrieve CPU and memory/core specified in configuration input.
    """
    assert len(items) > 0, "Finding job resources but no items to process"
    config = items[0]["config"]
    all_cores = []
    all_memory = []
    algs = [config_utils.get_algorithm_config(x) for x in items]
    progs = _get_resource_programs(programs, algs)
    # Calculate cores
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        all_cores.append(resources.get("cores", 1))
    if len(all_cores) == 0:
        all_cores.append(1)
    cores_per_job = max(all_cores)
    # Calculate memory. Use 1Gb memory usage per core as min baseline if not specified
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        memory = _get_prog_memory(resources, cores_per_job)
        if memory:
            all_memory.append(memory)
    if len(all_memory) == 0:
        all_memory.append(1)
    memory_per_core = max(all_memory)
    return cores_per_job, memory_per_core
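
cpu_and_memory reduces per-program requests to a single job specification by taking the maximum across programs. A toy sketch of that reduction, with a plain dict standing in for the parsed configuration (the dict shape is an assumption for illustration, not bcbio's actual structure):

def cpu_and_memory_sketch(progs, resources_by_prog):
    # Maximum cores across programs, defaulting each program to 1
    cores_per_job = max([resources_by_prog.get(p, {}).get("cores", 1) for p in progs] or [1])
    # Maximum memory per core in Gb, with the 1Gb/core baseline
    memory_per_core = max([resources_by_prog.get(p, {}).get("memory", 1) for p in progs] or [1])
    return cores_per_job, memory_per_core

print(cpu_and_memory_sketch(["bwa", "samtools"], {"bwa": {"cores": 8, "memory": 2}}))  # (8, 2)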
Example #3
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
            except Exception:
                logger.exception("Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
    return sort_file
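
Note the memory handling: samtools -m is a per-core value while sambamba -m is the total, so the per-core value is scaled up by the core count before use. A hypothetical re-implementation of just that scaling step (the real config_utils.adjust_memory handles more suffixes and both directions):

def scale_memory(mem, cores):
    # "2G" with 8 cores -> "16G"; assumes a single-letter unit suffix
    amount, unit = int(mem[:-1]), mem[-1].upper()
    return "%d%s" % (amount * cores, unit)

assert scale_memory("2G", 8) == "16G"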
Example #4
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items,
                                   ref_file, assoc_files.get("dbsnp"),
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += ["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            if _joint_calling(items):  # Prepare gVCFs if doing joint calling
                params += ["--emitRefConfidence", "GVCF", "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
Example #5
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #6
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    chrM is called as haploid, so its VCF lacks the PID phasing key/value pair
    in FORMAT; an initial chrM sample then causes errors during concatenation
    because headers are not merged. Updating the first header avoids this.
    """
    from bcbio.variation import ploidy
    c, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(c, 1, 2)) == 1:
        for c, x in exist_files[1:]:
            if ploidy.get_ploidy(items, (c, 1, 2)) > 1:
                replace_file = x
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s"
                % (replace_file, header_file), "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = []
        if "options" in resources:
            ropts += [str(x) for x in resources.get("options", [])]
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
               (utils.get_java_clprep(), header_file, base_file, base_fix_file, " ".join(ropts)),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [x for (c, x) in exist_files[1:]]
Example #7
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items,
                                   ref_file, assoc_files.get("dbsnp"), assoc_files.get("cosmic"),
                                   region, out_file)
        assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
            "Require full version of GATK 3.5+ for mutect2 calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += ["-T", "MuTect2",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("mutect2")
            broad_runner.run_gatk(params)
    return out_file
Example #8
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #9
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        config_script = os.path.realpath(utils.which("configManta.py"))
        cmd = [utils.get_program_python("configManta.py"), config_script]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        resources = config_utils.get_resources("manta", data["config"])
        if resources.get("options"):
            cmd += [str(x) for x in resources["options"]]
        # If we are removing polyX, avoid calling on small indels which require
        # excessively long runtimes on noisy WGS runs
        if "polyx" in dd.get_exclude_regions(data):
            cmd += ["--config", _prep_streamlined_config(config_script, work_dir)]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
Example #10
def _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files, data):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.
    """
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"

    # --times=2 tries twice remove adapters which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
    ropts = " ".join(str(x) for x in
                     config_utils.get_resources("cutadapt", data["config"]).get("options", []))
    base_cmd = ("{cutadapt} {ropts} --times=2 --quality-base={quality_base} "
                "--quality-cutoff=5 --format=fastq "
                "{adapter_cmd} ").format(**locals())
    if len(fastq_files) == 2:
        # support for the single-command paired trimming introduced in
        # cutadapt 1.8
        adapter_cmd = adapter_cmd.replace("-a ", "-A ")
        base_cmd += "{adapter_cmd} ".format(adapter_cmd=adapter_cmd)
        return _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data)
    else:
        return _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data)
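
For paired-end input, the read-1 adapter arguments are mirrored for read 2: cutadapt takes -a for R1 adapters and -A for R2 adapters (available since the paired trimming support in 1.8). A small runnable illustration of the string rewrite used above (the adapter sequences are made up):

adapters = ["AGATCGGAAGAGC", "CTGTCTCTTATACACATCT"]
adapter_cmd = " ".join("-a " + x for x in adapters)
r2_cmd = adapter_cmd.replace("-a ", "-A ")
print(adapter_cmd)  # -a AGATCGGAAGAGC -a CTGTCTCTTATACACATCT
print(r2_cmd)       # -A AGATCGGAAGAGC -A CTGTCTCTTATACACATCT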
Example #11
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    return _parse_qualimap_metrics(report_file)
Example #12
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Example #13
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for a in annotation.get_gatk_annotations(items[0]["config"]):
                params += ["--annotation", a]
            paired = vcfutils.get_paired_bams(align_bams, items)
            params += _add_tumor_params(paired)
            params += _add_region_params(region, out_file, items)
            params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner = broad.runner_from_config(items[0]["config"])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "MuTect2")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example #14
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """
    Run macs2 on ChIP and input samples, guarding against
    sample-specific errors.
    """
    # output file name needs to include the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("The %s genome doesn't have a pre-set value. "
                         "You can add specific values using the resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check the ChIP-seq configuration in the "
                         "bcbio-nextgen documentation." % genome_build)

    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
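
The genome-size handling honors a user-supplied -g from resources options and otherwise falls back to a pre-set table. A standalone sketch of that decision (the HS mapping below is a stand-in; bcbio ships its own):

HS = {"hg19": "hs", "mm10": "mm"}

def genome_size_arg(genome_build, options):
    if options.find("-g") > -1:
        return ""  # the user already set it via resources options
    if genome_build not in HS:
        raise ValueError("The %s genome doesn't have a pre-set value; pass "
                         "-g via macs2 resources options." % genome_build)
    return "-g %s" % HS[genome_build]

assert genome_size_arg("hg19", "") == "-g hs"
assert genome_size_arg("hg19", "-g 2.7e9") == ""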
Example #15
def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]
Example #16
def __init__(self, picard_ref, gatk_dir, config):
    resources = config_utils.get_resources("gatk", config)
    self._jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    self._picard_ref = config_utils.expand_path(picard_ref)
    self._gatk_dir = config_utils.expand_path(gatk_dir) or config_utils.expand_path(picard_ref)
    self._config = config
    self._gatk_version, self._picard_version = None, None
Example #17
def run(data):
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except Exception:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example #18
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Example #19
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example #20
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Example #21
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
Example #22
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
Example #23
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--min-alternate-fraction" not in " ".join(opts) and "-F" not in " ".join(opts):
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        min_af = float(utils.get_in(config, ("algorithm",
                                             "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
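
min_allele_fraction is configured as a percentage and converted to a 0-1 fraction before being passed to FreeBayes, and it is only added when the user has not already supplied one through resources options. A runnable sketch of that guard:

def min_af_option(opts, configured_pct=20):
    joined = " ".join(opts)
    if "--min-alternate-fraction" in joined or "-F" in joined:
        return opts  # respect a value supplied via resources options
    return opts + ["--min-alternate-fraction", str(float(configured_pct) / 100.0)]

print(min_af_option([]))  # ['--min-alternate-fraction', '0.2']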
Example #24
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = []
    opts += ["--format", "vcf", "--intarget"]  # output vcf, report only variants within bed regions
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--bed", target]
        else:
            tmp_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(tmp_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                print("%s\t%s\t%s" % (chrom, start, end), file=tx_tmp_bed)
            opts += ["--bed", tmp_bed]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm",
                                             "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
Example #25
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
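
The JVM memory adjustment deliberately under-scales: the heap is grown for roughly a fifth of the cores (rounded up) rather than all of them, leaving room for bcbio-variation-recall itself. The rounding step in isolation:

import math

def jvm_memcores(cores):
    # ceil(cores / 5): 1-5 cores -> 1, 6-10 -> 2, ...
    return int(math.ceil(float(cores) / 5.0))

assert [jvm_memcores(c) for c in (1, 5, 6, 16)] == [1, 1, 2, 4]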
Example #26
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Example #27
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Example #28
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan", None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
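
write_empty_vcf papers over VarScan's zero-byte outputs so downstream tools still see a parseable file. A hypothetical minimal version of such a helper (the real bcbio function may write additional header lines):

def write_empty_vcf_sketch(out_file):
    # A valid VCF: fileformat line plus the mandatory column header, no records
    with open(out_file, "w") as out_handle:
        out_handle.write("##fileformat=VCFv4.2\n")
        out_handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")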
Example #29
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #30
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #31
def _apply_vqsr(in_file, ref_file, recal_file, tranch_file, sensitivity_cutoff,
                filter_type, data):
    """Apply VQSR based on the specified tranche, returning a filtered VCF file.
    """
    broad_runner = broad.runner_from_config(data["config"])
    base, ext = utils.splitext_plus(in_file)
    out_file = "{base}-{filter}filter{ext}".format(base=base,
                                                   ext=ext,
                                                   filter=filter_type)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "-T", "ApplyRecalibration", "-R", ref_file, "--input", in_file,
                "--out", tx_out_file, "--tranches_file", tranch_file,
                "--recal_file", recal_file, "--mode", filter_type
            ]
            resources = config_utils.get_resources("gatk_apply_recalibration",
                                                   data["config"])
            opts = resources.get("options", [])
            if not opts:
                opts += ["--ts_filter_level", sensitivity_cutoff]
            params += opts
            broad_runner.run_gatk(params)
    return out_file
Example #32
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
            # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
            # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
            params += ["--use-new-qual-calculator"]
            resources = config_utils.get_resources("gatk", data["config"])
            params += [str(x) for x in resources.get("options", [])]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #33
def _run_qsnp_paired(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(
                        resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file,
                                  assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()),
                           "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
Example #34
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        #sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file
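The fq1_cmd/fq2_cmd strings above use bash process substitution so salmon reads a decompressed stream either way. A small sketch of the selection logic (this is_gzipped is an extension-based assumption, not necessarily bcbio's helper):

def is_gzipped(path):
    # assumed behavior: detect gzip by file extension
    return path.endswith((".gz", ".gzip"))

fq1 = "sample_R1.fastq.gz"
fq1_cmd = ("<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})").format(fq1=fq1)
# fq1_cmd == "<(gzip -cd sample_R1.fastq.gz)"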
Example #35
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes
    jvm_opts = config_utils.adjust_opts(
        jvm_opts, {
            "algorithm": {
                "memory_adjust": {
                    "direction": "increase",
                    "magnitude": max(2, dd.get_cores(data))
                }
            }
        })
    memory = " ".join(jvm_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    java_args = "-Djava.io.tmpdir=%s" % utils.safe_makedir(
        os.path.join(os.path.dirname(out_file), "tmp"))
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (
        utils.get_java_binpath())
    cmd = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return cmd.format(**locals())
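config_utils.adjust_opts rescales the JVM heap according to the memory_adjust block, here multiplying by the core count (minimum 2). A hypothetical mimic showing the intended effect on the -Xmx component (not the real implementation):

def scale_xmx(jvm_opts, magnitude):
    # hypothetical: multiply the -Xmx gigabyte value by `magnitude`
    out = []
    for opt in jvm_opts:
        if opt.startswith("-Xmx") and opt.endswith("g"):
            out.append("-Xmx%dg" % (int(opt[4:-1]) * magnitude))
        else:
            out.append(opt)
    return out

scale_xmx(["-Xms750m", "-Xmx3g"], 2)  # ['-Xms750m', '-Xmx6g']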
Example #36
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                                          (base_name, rext, fext))
                                             for fext in ["s1", "p1", "p2", "o1", "o2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
             (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]
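The colsbs value sizes biobambam's collation sort buffer per run. Assuming config_utils.convert_to_bytes("1G") returns a plain byte count, the computation above amounts to:

cores = 4
max_mem = (1024 ** 3) * cores  # 4294967296 bytes handed to bamtofastq's colsbs option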
Example #37
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += [
            "--minimum_mutation_cell_fraction",
            "%.2f" % (min_af / 100.0)
        ]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
Example #38
def _novoalign_args_from_config(config, need_quality=True):
    """Select novoalign options based on configuration parameters.
    """
    if need_quality:
        qual_format = config["algorithm"].get("quality_format", "").lower()
        qual_flags = ["-F", "ILMFQ" if qual_format == "illumina" else "STDFQ"]
    else:
        qual_flags = []
    multi_mappers = config["algorithm"].get("multiple_mappers")
    if multi_mappers is True:
        multi_flag = "Random"
    elif isinstance(multi_mappers, str):
        multi_flag = multi_mappers
    else:
        multi_flag = "None"
    multi_flags = ["-r"] + multi_flag.split()
    resources = config_utils.get_resources("novoalign", config)
    # default arguments for improved variant calling based on
    # comparisons to reference materials: turn off soft clipping and recalibrate
    if resources.get("options") is None:
        extra_args = ["-o", "FullNW", "-k"]
    else:
        extra_args = [str(x) for x in resources.get("options", [])]
    return qual_flags + multi_flags + extra_args
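A usage sketch with a hypothetical configuration and no resources block, so the variant-calling defaults apply:

config = {"algorithm": {"quality_format": "illumina", "multiple_mappers": True},
          "resources": {}}
# _novoalign_args_from_config(config) would return:
# ["-F", "ILMFQ", "-r", "Random", "-o", "FullNW", "-k"]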
Example #39
def _run_break_point_inspector(data, variant_file, paired):
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector",
                                                   data["config"])
            memory = config_utils.adjust_opts(
                resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]), {
                    "algorithm": {
                        "memory_adjust": {
                            "magnitude": cores,
                            "direction": "increase"
                        }
                    }
                })
            cmd = ["break-point-inspector"]
            cmd += memory
            cmd += ["-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf]
            do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
Example #40
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
Example #41
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    params += ["--fraction_contamination", contamination]
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
Example #42
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--min-consensus-base-quality"]
    defaults = {
        "--min-reads": "1",
        "--min-map-q": "1",
        "--min-consensus-base-quality": "13",
        "--min-input-base-quality": "2",
        "--edits": "1"
    }
    ropts = config_utils.get_resources("fgbio",
                                       data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, \
        "Expect an even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s %s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s %s" % (x, defaults[x]) for x in cons_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out
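tz.partition(2, ropts) pairs each flag with its value, letting user-supplied options override the defaults dict. Illustrated with toolz directly:

from toolz import partition

ropts = ["--min-reads", "2", "--edits", "0"]
overrides = dict(partition(2, ropts))
# overrides == {"--min-reads": "2", "--edits": "0"}
# defaults.update(overrides) then replaces only the flags the user mentioned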
Example #43
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq, the awk code may need to be expanded to add them.
    """
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (
                r"paste <({fastq_file} | paste - - - -) "
                r"<({pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = fastq_file[2:-1]
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (
                r"paste <(zcat {fastq_file} | paste - - - -) "
                r"<(zcat {pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = "zcat {fastq_file}"

    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file,
                                bool(pair_file)) as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(**locals())
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(
                cmd.format(**locals()) + tobam_cl,
                "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
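An illustrative trace of the awk interleaving above, with assumed read names:

# paste merges one 4-line record from each file into a single tab-separated line:
#   @rd1 extra  ACGT  +  IIII  @rd1 extra  TGCA  +  IIII
# the awk program splits each name on spaces, appends /1 and /2 when missing,
# swaps remaining spaces for underscores, and emits an 8-line interleaved record:
#   @rd1/1, ACGT, +, IIII, @rd1/2, TGCA, +, IIII  (each on its own line)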
Example #44
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation
       paired is one t/n pair or only """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd()
    purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and with filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case there is no germline file, so fall back to the vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in([
        "config", "algorithm", "background", "cnv_reference", "purecn_normaldb"
    ], sample)
    mappingbiasfile = tz.get_in([
        "config", "algorithm", "background", "cnv_reference",
        "purecn_mapping_bias"
    ], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [
        rscript, purecn_r, "--out", work_dir, "--tumor", sample_coverage,
        "--sampleid", sample_name, "--vcf", variants_vcf, "--normaldb",
        normaldb, "--mapping-bias-file", mappingbiasfile, "--intervals",
        intervals, "--snp-blacklist", simple_repeat_bed, "--genome", genome,
        "--force", "--post-optimize", "--seed", "123", "--bootstrapn", "500",
        "--cores",
        dd.get_num_cores(sample)
    ]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # using a matched normal sample is not recommended in PureCN analysis,
    # because it then skips the PON coverage normalization and denoising steps;
    # but if one is supplied, we still use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"],
                                        normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(
                env="base"), utils.get_R_exports(env="base"), " ".join(
                    [str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed: %s" % msg)
    out_base, out, all_files = _get_purecn_files(paired,
                                                 work_dir,
                                                 require_exist=True)
    return out
Example #45
def _get_options_from_config(config):
    opts = []
    resources = config_utils.get_resources("hisat2", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    return opts
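As elsewhere, resource options pass straight through to the command line. For example, with a hypothetical configuration:

config = {"resources": {"hisat2": {"options": ["--no-unal", "-k", 10]}}}
# _get_options_from_config(config) -> ["--no-unal", "-k", "10"]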
Example #46
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full, non-open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        f1r2_file = None
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                "--annotation", "ClippingRankSumTest", "--annotation",
                "DepthPerSampleHC"
            ]
            if gatk_type == "gatk4":
                params += ["--reference", ref_file]
            else:
                params += ["-R", ref_file]
            for a in annotation.get_gatk_annotations(
                    items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)

            tools_on = utils.get_in(items[0], ("config", "tools_on"), []) or []
            orientation_filter = (all(is_paired(bam) for bam in align_bams)
                                  and "mutect2_readmodel" in tools_on)

            if gatk_type == "gatk4" and orientation_filter:
                f1r2_file = "{}-f1r2.tar.gz".format(
                    utils.splitext_plus(out_file)[0])
                params += ["--f1r2-tar-gz", f1r2_file]

            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            resources = config_utils.get_resources("mutect2",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params,
                                            os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":

                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(
                    out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(
                    tx_out_file)

                if orientation_filter:
                    tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                    tx_f1r2_file = tx_f1r2_file.format(
                        utils.splitext_plus(f1r2_file)[0])
                    tx_read_orient_cmd = _mutect2_read_filter(
                        broad_runner, f1r2_file, tx_f1r2_file)

                    filter_cmd = _mutect2_filter(broad_runner,
                                                 tx_raw_prefilt_file,
                                                 tx_raw_file, ref_file,
                                                 tx_f1r2_file)
                else:
                    filter_cmd = _mutect2_filter(broad_runner,
                                                 tx_raw_prefilt_file,
                                                 tx_raw_file, ref_file)
                if orientation_filter:
                    cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                else:
                    cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
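The GATK 3.5+ assertion relies on LooseVersion comparing version components numerically, left to right:

from distutils.version import LooseVersion

LooseVersion("3.8-1-0-gf15c1c3ef") >= LooseVersion("3.5")  # True (8 > 5)
LooseVersion("3.4-46") >= LooseVersion("3.5")              # False (4 < 5)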
Example #47
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion
                    }
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(
                            ("config", "algorithm", "clinical_reporting"),
                            data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
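The sed filter only touches non-header lines (the /^#/! address) and collapses the empty INFO fields VEP can emit. Illustrative before/after on an assumed record:

# before: chr1  1014  .  A  G  50  PASS  DP=10;;CSQ=...
# after : chr1  1014  .  A  G  50  PASS  DP=10;CSQ=...
# lines beginning with '#' pass through untouched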
Example #48
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl",
                                               data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {
                        "dbnsfp": _get_dbnsfp,
                        "loftee": _get_loftee,
                        "dbscsnv": _get_dbscsnv,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer
                    }
                    plugins = tz.get_in(
                        ("config", "resources", "vep", "plugins"), data,
                        ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += [
                        "--hgvs", "--shift_hgvs", "1", "--fasta",
                        dd.get_ref_file(data)
                    ]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(
                            ("config", "algorithm", "clinical_reporting"),
                            data)):
                    config_args += ["--pick"]
                std_fields = [
                    "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                    "Feature", "EXON"
                ] + prediction_fields + [
                    "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"
                ]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
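The --fields argument pins the CSQ annotation order so downstream parsers can index columns positionally. For the human case above, std_fields + config_fields would join to something like (plugin fields vary by configuration):

# "Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,"
# "Protein_position,BIOTYPE,CANONICAL,CCDS,..." plus dbNSFP/LoFtee/HGVS fields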
Example #49
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    fastq_files = (" ".join([
        _unpack_fastq(fastq_file),
        _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
Example #50
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3,
                                         "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file,
                                    bam.is_paired(in_bam)) as (tobam_cl,
                                                               tx_out_file):
                if not hla_on(data) or needs_separate_hla(data):
                    bwa_cmd = _get_bwa_mem_cmd(data,
                                               out_file,
                                               ref_file,
                                               "-",
                                               with_hla=False)
                else:
                    bwa_cmd = _get_bwa_mem_cmd(data,
                                               out_file,
                                               ref_file,
                                               "-",
                                               with_hla=True)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = (
                    "unset JAVA_HOME && "
                    "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"],
                       None, [
                           do.file_nonempty(tx_out_file),
                           do.file_reasonable_size(tx_out_file, in_bam)
                       ])
    data["work_bam"] = out_file
    hla_file = "HLA-" + out_file
    if needs_separate_hla(data) and not utils.file_exists(hla_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, hla_file,
                                    bam.is_paired(in_bam)) as (tobam_cl,
                                                               tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data,
                                           hla_file,
                                           ref_file,
                                           "-",
                                           with_hla=True)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = (
                    "unset JAVA_HOME && "
                    "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"],
                       None, [
                           do.file_nonempty(tx_out_file),
                           do.file_reasonable_size(tx_out_file, in_bam)
                       ])
        # the with_hla=True alignment above already produced the HLA-specific BAM
        data["hla_bam"] = hla_file
    return data
Example #51
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem,
                tx_results_dir)
            cmd = (
                "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                "--skip-duplicated --skip-dup-mode 0 "
                "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [
                None, False, "None"
            ] else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(
                    bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()),
                   "Qualimap: %s" % dd.get_sample_name(data),
                   env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir,
                                           "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, to keep its name after upload, we need to put the base QC file (results_file)
    # into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file)
    }
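The final sed call rewrites the 'bam file =' line so MultiQC reports the sample name rather than the (possibly downsampled) BAM path. Illustrative effect on genome_results.txt, with assumed paths:

# before: bam file = /work/qc/sample1-downsample.bam
# after : bam file = sample1.bam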
Example #52
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if _use_spark(num_cores, gatk_type):
                params += [
                    "-T", "HaplotypeCallerSpark", "--spark-master",
                    "local[%s]" % num_cores, "--conf",
                    "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                    "--conf", "spark.driver.host=localhost", "--conf",
                    "spark.network.timeout=800", "--conf",
                    "spark.executor.heartbeatInterval=100"
                ]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += [
                "--annotation", "ClippingRankSumTest", "--annotation",
                "DepthPerSampleHC"
            ]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += [
                            "--pair_hmm_implementation",
                            "VECTOR_LOGLESS_CACHING"
                        ]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d)
                                            for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += [
                        "--variant_index_type", "LINEAR",
                        "--variant_index_parameter", "128000"
                    ]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                if not is_joint and gatk_type == "gatk4":
                    params += [
                        "-ploidy",
                        str(ploidy.get_ploidy(items, region))
                    ]
            resources = config_utils.get_resources("gatk-haplotype",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {
                "magnitude": 0.9 * num_cores,
                "direction": "increase"
            } if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params,
                                      os.path.dirname(tx_out_file),
                                      memscale=memscale,
                                      parallel_gc=_use_spark(
                                          num_cores, gatk_type))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                if (_use_spark(num_cores, gatk_type) and str(
                        msg
                ).find("java.lang.UnsupportedOperationException: empty collection"
                       ) >= 0
                        and str(msg).find("at org.apache.spark.rdd.RDD") >= 0):
                    vcfutils.write_empty_vcf(
                        tx_out_file,
                        samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])

    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        with open(out_file, "r") as in_handle:
            is_plain_text = in_handle.readline().startswith("##fileformat")
        if is_plain_text:
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
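The memscale dict asks the runner to scale JVM memory up by roughly the core count, keeping about 10% in reserve:

num_cores = 8
memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
# memscale == {"magnitude": 7.2, "direction": "increase"}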
Example #53
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += [
                    "-T", "HaplotypeCallerSpark", "--sparkMaster",
                    "local[%s]" % num_cores, "--conf",
                    "spark.local.dir=%s" % os.path.dirname(tx_out_file)
                ]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += [
                "--annotation", "ClippingRankSumTest", "--annotation",
                "DepthPerSampleHC"
            ]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += [
                        "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"
                    ]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d)
                                            for d in items):
                is_joint = True
                params += ["--emitRefConfidence", "GVCF"]
                if not gatk_type == "gatk4":
                    params += [
                        "--variant_index_type", "LINEAR",
                        "--variant_index_parameter", "128000"
                    ]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                if not is_joint and gatk_type == "gatk4":
                    params += [
                        "-ploidy",
                        str(ploidy.get_ploidy(items, region))
                    ]
            resources = config_utils.get_resources("gatk-haplotype",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {
                "magnitude": 0.9 * num_cores,
                "direction": "increase"
            } if num_cores > 1 else None
            broad_runner.run_gatk(params,
                                  os.path.dirname(tx_out_file),
                                  memscale=memscale,
                                  parallel_gc=(num_cores > 1
                                               and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example #54
def calculate(parallel, items, sysinfo, config, multiplier=1,
              max_multicore=None):
    """Determine cores and workers to use for this stage based on used programs.
    multiplier specifies the number of regions items will be split into during
    processing.
    max_multicore specifies an optional limit on the maximum cores. Can use to
    force single core processing during specific tasks.
    sysinfo specifies cores and memory on processing nodes, allowing us to tailor
    jobs for available resources.
    """
    assert len(items) > 0, "Finding job resources but no items to process"
    all_cores = []
    all_memory = []
    # Reserve 100Mb (0.10Gb) of additional memory for the system
    system_memory = 0.10
    algs = [config_utils.get_algorithm_config(x) for x in items]
    progs = _get_resource_programs(parallel.get("progs", []), algs)
    # Calculate cores
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        all_cores.append(resources.get("cores", 1))
    if len(all_cores) == 0:
        all_cores.append(1)
    cores_per_job = max(all_cores)
    if max_multicore:
        cores_per_job = min(cores_per_job, max_multicore)
    if "cores" in sysinfo:
        cores_per_job = min(cores_per_job, int(sysinfo["cores"]))
    total = parallel["cores"]
    if total > cores_per_job:
        num_jobs = total // cores_per_job
    else:
        num_jobs, cores_per_job = 1, total

    # Calculate memory. Use 1Gb memory usage per core as min baseline if not specified
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        memory = _get_prog_memory(resources, cores_per_job)
        if memory:
            all_memory.append(memory)
    if len(all_memory) == 0:
        all_memory.append(1)
    memory_per_core = max(all_memory)

    logger.debug("Resource requests: {progs}; memory: {memory}; cores: {cores}".format(
        progs=", ".join(progs), memory=", ".join("%.2f" % x for x in all_memory),
        cores=", ".join(str(x) for x in all_cores)))

    cores_per_job, memory_per_core = _ensure_min_resources(progs, cores_per_job, memory_per_core,
                                                           min_memory=parallel.get("ensure_mem", {}))
    if cores_per_job == 1:
        memory_per_job = "%.2f" % memory_per_core
        num_jobs, mem_pct = _scale_jobs_to_memory(num_jobs, memory_per_core, sysinfo)
    else:
        cores_per_job, memory_per_job, mem_pct = _scale_cores_to_memory(cores_per_job,
                                                                        memory_per_core, sysinfo,
                                                                        system_memory)
        # For local runs with multiple jobs and multiple cores, potentially scale jobs down
        if num_jobs > 1 and parallel.get("type") == "local":
            memory_per_core = float(memory_per_job) / cores_per_job
            num_jobs, _ = _scale_jobs_to_memory(num_jobs, memory_per_core, sysinfo)

    # do not overschedule if we don't have extra items to process
    num_jobs = min(num_jobs, len(items) * multiplier)
    logger.debug("Configuring %d jobs to run, using %d cores each with %sg of "
                 "memory reserved for each job" % (num_jobs, cores_per_job,
                                                   str(memory_per_job)))
    parallel = copy.deepcopy(parallel)
    parallel["cores_per_job"] = cores_per_job
    parallel["num_jobs"] = num_jobs
    parallel["mem"] = str(memory_per_job)
    parallel["mem_pct"] = "%.2f" % mem_pct
    parallel["system_cores"] = sysinfo.get("cores", 1)
    return parallel
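A worked example of the core arithmetic above (illustrative numbers; the _scale_* helpers are not shown):

cores_per_job = 4                  # max of the per-program "cores" requests
total = 16                         # parallel["cores"]
num_jobs = total // cores_per_job  # 4 jobs of 4 cores each
# with multiplier == 1 and two items, overscheduling is trimmed:
num_jobs = min(num_jobs, 2 * 1)    # -> 2 jobs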
Beispiel #55
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl",
                                               data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"),
                             data, False):
                    # For clinical reporting we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # Also use HGVS reporting, which requires an indexed reference file
                    clinical_args = [
                        "--pick", "--hgvs", "--shift_hgvs", "1", "--fasta",
                        dd.get_ref_file(data)
                    ]
                    clinical_fields = ["HGVSc", "HGVSp"]
                else:
                    clinical_args, clinical_fields = [], []
                std_fields = [
                    "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                    "Feature", "EXON"
                ] + prediction_fields + [
                    "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"
                ]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields + clinical_fields)] + \
                       prediction_args + dbnsfp_args + loftee_args + clinical_args

                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
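The sed '/^#/! s/;;/;/g' step above collapses empty INFO fields on non-header lines only. A pure-Python sketch of the same transform, with a made-up VCF line, shows the intent:

def strip_empty_fields(vcf_line):
    """Collapse ';;' to ';' on non-header VCF lines, mirroring the sed filter (sketch)."""
    return vcf_line if vcf_line.startswith("#") else vcf_line.replace(";;", ";")

assert strip_empty_fields("##fileformat=VCFv4.2;;") == "##fileformat=VCFv4.2;;"
assert strip_empty_fields("1\t100\t.\tA\tG\t.\tPASS\tAB=1;;CD=2") == \
       "1\t100\t.\tA\tG\t.\tPASS\tAB=1;CD=2"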
Example #56
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type,
              data):
    """Run variant quality score recalibration.
    """
    cutoffs = [
        "100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93",
        "99.92", "99.91", "99.9", "99.8", "99.7", "99.6", "99.5", "99.0",
        "98.0", "90.0"
    ]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    gatk_type = broad_runner.gatk_type()
    base = utils.splitext_plus(in_file)[0]
    recal_file = ("%s-vqsrrecal.vcf.gz" %
                  base) if gatk_type == "gatk4" else ("%s.recal" % base)
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file,
                              plot_file) as (tx_recal, tx_tranches,
                                             tx_plot_file):
            params = [
                "-T", "VariantRecalibrator", "-R", ref_file, "--mode",
                filter_type
            ]
            if gatk_type == "gatk4":
                params += [
                    "--variant", in_file, "--output", tx_recal,
                    "--tranches-file", tx_tranches, "--rscript-file",
                    tx_plot_file
                ]
            else:
                params += [
                    "--input", in_file, "--recal_file", tx_recal,
                    "--tranches_file", tx_tranches, "--rscript_file",
                    tx_plot_file
                ]
            params += _get_vqsr_training(filter_type, vrn_files, gatk_type)
            resources = config_utils.get_resources("gatk_variant_recalibrator",
                                                   data["config"])
            opts = resources.get("options", [])
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type, data):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params,
                                      log_error=False,
                                      memscale=memscale,
                                      parallel_gc=True)
            except Exception:  # Can fail to run if not enough values are present to train.
                return None, None
    if gatk_type == "gatk4":
        vcfutils.bgzip_and_index(recal_file, data["config"])
    return recal_file, tranches_file
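One subtlety above: cutoffs holds strings, so cutoffs.sort() orders them lexicographically ("100.0" sorts before "90.0"). Since each value is passed to GATK as its own -tranche option, the ordering looks cosmetic here; if numeric ordering were ever needed, a short sketch:

cutoffs = ["90.0", "100.0", "99.9"]
cutoffs.sort()            # lexicographic: ['100.0', '90.0', '99.9']
cutoffs.sort(key=float)   # numeric:       ['90.0', '99.9', '100.0']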
Example #57
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), {
                "algorithm": {
                    "memory_adjust": {
                        "magnitude": threads,
                        "direction": "increase"
                    }
                }
            })
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = [
            "rtg", "vcfeval", "--threads",
            str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c",
            vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
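The -Xms/-Xmx handling above keeps the stack option verbatim for RTG_JAVA_OPTS but strips the heap prefix for RTG_MEM. A minimal sketch of that split, with the same fallbacks as the code above:

def split_jvm_opts(jvm_opts):
    """Return (stack option, bare heap size) from a list of JVM options (sketch)."""
    stack = next((x for x in jvm_opts if x.startswith("-Xms")), "-Xms500m")
    heap = next((x[len("-Xmx"):] for x in jvm_opts if x.startswith("-Xmx")), "3g")
    return stack, heap

assert split_jvm_opts(["-Xms500m", "-Xmx4g"]) == ("-Xms500m", "4g")
assert split_jvm_opts([]) == ("-Xms500m", "3g")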
Example #58
def _get_preseq_params(data, preseq_cmd, read_count):
    """ Get parameters through resources.
        If "step" or "extrap" limit are not provided, then calculate optimal values based on read count.
    """
    defaults = {
        'seg_len': 100000,     # maximum segment length when merging paired-end bam reads
        'steps': 300,          # number of points on the plot
        'extrap_fraction': 3,  # extrapolate up to X times read_count
        'extrap': None,        # extrapolate up to X reads
        'step': None,          # step size (number of reads between points on the plot)
        'options': '',
    }
    params = {}

    main_opts = [("-e", "-extrap"), ("-l", "-seg_len"), ("-s", "-step")]
    other_opts = config_utils.get_resources("preseq",
                                            data["config"]).get("options", [])
    if isinstance(other_opts, str):
        other_opts = [other_opts]
    for sht, lng in main_opts:
        if sht in other_opts:
            i = other_opts.index(sht)
        elif lng in other_opts:
            i = other_opts.index(lng)
        else:
            i = None
        if i is not None:
            params[lng[1:]] = other_opts[i + 1]
            other_opts = other_opts[:i] + other_opts[i + 2:]
    params['options'] = ' '.join(other_opts)
    for k, v in config_utils.get_resources("preseq", data["config"]).items():
        if k != 'options':
            params[k] = v

    params['steps'] = params.get('steps', defaults['steps'])

    if preseq_cmd == 'c_curve':
        params['extrap_fraction'] = 1
    else:
        if params.get('step') is None:
            if params.get('extrap') is None:
                unrounded_extrap = read_count * params.get(
                    'extrap_fraction', defaults['extrap_fraction'])
                unrounded_step = unrounded_extrap // params['steps']
                if params.get('extrap_fraction') is not None:
                    # extrap_fraction explicitly provided
                    params['extrap'] = unrounded_extrap
                    params['step'] = unrounded_step
                else:
                    # round the step down to one significant digit
                    power_of_10 = 10 ** math.floor(math.log(unrounded_step, 10))
                    rounded_step = int(unrounded_step // power_of_10) * power_of_10
                    params['step'] = rounded_step
                    params['extrap'] = rounded_step * params['steps']
            else:
                params['step'] = params['extrap'] // params['steps']

        elif params.get('extrap') is None:
            params['extrap'] = params['step'] * params['steps']

    params['step'] = params.get('step', defaults['step'])
    params['extrap'] = params.get('extrap', defaults['extrap'])
    params['seg_len'] = params.get('seg_len', defaults['seg_len'])

    logger.info(
        "Preseq: running {steps} steps of size {step}, extrap limit {extrap}".
        format(**params))
    return params
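The step rounding above reduces the step size to one significant digit so plot points land on round read counts. A standalone sketch of that rounding (inputs are illustrative):

import math

def round_to_one_significant_digit(step):
    """Round a positive step down to its leading digit times a power of ten (sketch)."""
    power_of_10 = 10 ** math.floor(math.log(step, 10))
    return int(step // power_of_10) * power_of_10

assert round_to_one_significant_digit(1234) == 1000
assert round_to_one_significant_digit(987) == 900
assert round_to_one_significant_digit(7) == 7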
Example #59
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs(
        [utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(
                            samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources(
                        "multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for data in samples:
            data_files.add(
                os.path.join(out_dir, "report", "metrics",
                             dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(
            os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*")):
            data_files.add(f)
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {
            "base": out_file,
            "secondary": data_files
        }

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(
            samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                data_json_final)

        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                file_list_final)

    return [[data] for data in samples]
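The secondary-file bookkeeping above boils down to "collect candidate paths, keep unique existing files". A small self-contained sketch of that pattern, using a temporary file in place of real MultiQC outputs:

import os
import tempfile

def existing_unique(paths):
    """Deduplicate candidate paths and keep only files present on disk (sketch)."""
    return sorted({p for p in paths if p and os.path.exists(p)})

with tempfile.NamedTemporaryFile() as tmp:
    assert existing_unique([tmp.name, tmp.name, "/no/such/file", None]) == [tmp.name]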
Example #60
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join(
                [str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
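For orientation, the paired-end branch above expands to a single bamtofastq invocation whose outputs are bash process substitutions feeding bgzip. Printing the template with hypothetical file names makes the final shape visible:

# All paths here are hypothetical stand-ins for the real transaction files.
fq1_bgzip_cmd = "bgzip -c /dev/stdin > sample-1.fq.gz"
fq2_bgzip_cmd = "bgzip -c /dev/stdin > sample-2.fq.gz"
out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
           "O2=/dev/null collate=1 colsbs={max_mem}")
cmd = "bamtofastq filename=sample.bam T=sample-sort " + out_str
print(cmd.format(fq1_bgzip_cmd=fq1_bgzip_cmd, fq2_bgzip_cmd=fq2_bgzip_cmd,
                 max_mem=1024 ** 3))  # 1G in bytes; the real code scales this by core count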