Example #1
0
def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in
    ``work_dir`` and ``_out_of_date`` does not flag it.

    items -- sample data dictionaries; supply the BAM inputs when unpaired
    paired -- tumor/normal pairing object, or a false value when unpaired
    work_dir -- directory passed to Manta as its run directory

    Returns the path to ``runWorkflow.py`` inside ``work_dir``.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script) and not _out_of_date(workflow_script):
        return workflow_script

    manta_config = os.path.realpath(utils.which("configManta.py"))
    args = [utils.get_program_python("configManta.py"), manta_config]
    if paired:
        if paired.normal_bam:
            args.append("--normalBam=%s" % paired.normal_bam)
        args.append("--tumorBam=%s" % paired.tumor_bam)
        ref_data = paired.tumor_data
    else:
        args.extend(["--bam=%s" % dd.get_align_bam(sample) for sample in items])
        ref_data = items[0]
    args.append("--referenceFasta=%s" % dd.get_ref_file(ref_data))
    args.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(ref_data) not in ["genome"]:
        args.append("--exome")
    for region in _maybe_limit_chromosomes(ref_data):
        args.extend(["--region", region])
    manta_resources = config_utils.get_resources("manta", ref_data["config"])
    if manta_resources.get("options"):
        args.extend([str(opt) for opt in manta_resources["options"]])
    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(ref_data):
        args.extend(["--config", _prep_streamlined_config(manta_config, work_dir)])
    do.run(args, "Configure manta SV analysis")
    return workflow_script
Example #2
0
def run(items):
    """Combine structural variant calls with MetaSV when enough callers are present.

    items -- single-element list holding the sample data dictionary

    Each supported caller found in ``data["sv"]`` is passed to MetaSV once.
    When at least ``MIN_CALLERS`` are available the MetaSV output is hard
    filtered, annotated with snpEff and appended to ``data["sv"]``.

    Returns a single-element list with the (possibly updated) sample data.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    metasv_cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                               "--reference", dd.get_ref_file(data),
                               "--bam", dd.get_align_bam(data),
                               "--outdir", work_dir]
    seen_callers = []
    for sv_call in data.get("sv", []):
        caller = sv_call["variantcaller"]
        if caller not in SUPPORTED or caller in seen_callers:
            continue
        seen_callers.append(caller)
        metasv_cmd += ["--%s_vcf" % caller, sv_call.get("vcf_file", sv_call["vrn_file"])]
    if len(seen_callers) < MIN_CALLERS:
        return [data]

    if not utils.file_exists(out_file):
        raw_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
        insert_stats = shared.calc_paired_insert_stats_save(
            dd.get_align_bam(data), os.path.join(raw_dir, "insert-stats.yaml"))
        metasv_cmd += ["--workdir", raw_dir, "--num_threads", str(dd.get_num_cores(data))]
        metasv_cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
        metasv_cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
        metasv_cmd += ["--boost_sc", "--isize_mean", insert_stats["mean"],
                       "--isize_sd", insert_stats["std"]]
        do.run(metasv_cmd, "Combine variant calls with MetaSV")
    # Expression handed to the hard filter under the name ReassemblyStats
    filter_expr = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
    filtered_vcf = vfilter.hard_w_expression(out_file, filter_expr, data,
                                             name="ReassemblyStats", limit_regions=None)
    annotated_vcf, _ = effects.add_to_vcf(filtered_vcf, data, "snpeff")
    data["sv"].append({"variantcaller": "metasv",
                       "vrn_file": annotated_vcf or filtered_vcf})
    return [data]
Example #3
0
def run(calls, data):
    """Combine structural variant calls with MetaSV when enough callers are present.

    calls -- list of per-caller dictionaries; extended in place with MetaSV output
    data -- sample data dictionary

    Returns ``calls``, with a "metasv" entry appended when at least
    ``MIN_CALLERS`` supported callers were supplied.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    metasv_cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                               "--reference", dd.get_ref_file(data),
                               "--bam", dd.get_align_bam(data),
                               "--outdir", work_dir]
    supported_calls = [c for c in calls if c["variantcaller"] in SUPPORTED]
    for sv_call in supported_calls:
        metasv_cmd += ["--%s_vcf" % sv_call["variantcaller"],
                       sv_call.get("vcf_file", sv_call["vrn_file"])]
    if len(supported_calls) < MIN_CALLERS:
        return calls

    if not utils.file_exists(out_file):
        raw_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
        insert_stats = shared.calc_paired_insert_stats_save(
            dd.get_align_bam(data), os.path.join(raw_dir, "insert-stats.yaml"))
        metasv_cmd += ["--workdir", raw_dir, "--num_threads", str(dd.get_num_cores(data))]
        metasv_cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
        metasv_cmd += ["--boost_ins", "--isize_mean", insert_stats["mean"],
                       "--isize_sd", insert_stats["std"]]
        do.run(metasv_cmd, "Combine variant calls with MetaSV")
    calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
Example #4
0
def run(calls, data):
    """Combine structural variant calls with MetaSV and hard filter the result.

    calls -- list of per-caller dictionaries; extended in place with MetaSV output
    data -- sample data dictionary

    Returns ``calls``, with a filtered "metasv" entry appended when at least
    ``MIN_CALLERS`` supported callers were supplied.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    metasv_cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                               "--reference", dd.get_ref_file(data),
                               "--bam", dd.get_align_bam(data),
                               "--outdir", work_dir]
    supported_calls = [c for c in calls if c["variantcaller"] in SUPPORTED]
    for sv_call in supported_calls:
        metasv_cmd += ["--%s_vcf" % sv_call["variantcaller"],
                       sv_call.get("vcf_file", sv_call["vrn_file"])]
    if len(supported_calls) < MIN_CALLERS:
        return calls

    if not utils.file_exists(out_file):
        raw_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
        insert_stats = shared.calc_paired_insert_stats_save(
            dd.get_align_bam(data), os.path.join(raw_dir, "insert-stats.yaml"))
        metasv_cmd += ["--workdir", raw_dir, "--num_threads", str(dd.get_num_cores(data))]
        metasv_cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
        metasv_cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
        metasv_cmd += ["--boost_ins", "--isize_mean", insert_stats["mean"],
                       "--isize_sd", insert_stats["std"]]
        do.run(metasv_cmd, "Combine variant calls with MetaSV")
    # Expression handed to the hard filter under the name ReassemblyStats
    filter_expr = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>10000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>20) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>1)")
    filtered_vcf = vfilter.hard_w_expression(out_file, filter_expr, data,
                                             name="ReassemblyStats", limit_regions=None)
    calls.append({"variantcaller": "metasv", "vrn_file": filtered_vcf})
    return calls
Example #5
0
def gatk_cmd(name, jvm_opts, params):
    """Build the shell command line for a gatk or gatk-framework wrapper.

    name -- executable name to look up
    jvm_opts -- JVM option strings, joined with spaces
    params -- tool parameters; each is converted with str()

    The executable is searched next to the running python first and then
    through a plain lookup by name. Returns the command string, or None when
    the executable cannot be located.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    executable = utils.which(os.path.join(python_bin_dir, name)) or utils.which(name)
    if not executable:
        return None
    template = "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s"
    return template % (os.path.dirname(executable), executable,
                       " ".join(jvm_opts), " ".join(str(p) for p in params))
Example #6
0
def gatk_cmd(name, jvm_opts, params):
    """Build the shell command line for a gatk or gatk-framework wrapper.

    name -- executable name to look up
    jvm_opts -- JVM option strings, joined with spaces
    params -- tool parameters; each is converted with str()

    The executable is searched next to the running python first. If it is
    not installed there, fall back to a plain lookup by name instead of
    failing inside os.path.dirname(None).

    Returns the command string, or None when the executable cannot be located.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    executable = utils.which(os.path.join(python_bin_dir, name))
    # if we can't find via the local executable, fallback to being in the path
    if not executable:
        executable = utils.which(name)
    if not executable:
        return None
    return "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s" % \
        (os.path.dirname(executable), executable,
         " ".join(jvm_opts), " ".join([str(x) for x in params]))
Example #7
0
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.

    For the ``svprioritize`` and ``coverage`` algorithm settings, a value that
    is set but is not an existing path is treated as a name to resolve against
    installed files:

    - ``<ref dir>/../coverage/prioritize/<val>*.bed`` and ``.bed.gz``
    - for ``svprioritize`` only, files starting with the value's basename in
      the directory holding ``simple_sv_annotation.py`` (skipped when that
      script is not installed)

    With several candidates, an exact ``<val>.bed``/``<val>.bed.gz`` match wins;
    otherwise the candidate sorting last is used (handles date-stamped names).

    Returns the data dictionary with resolved paths substituted.
    Raises ValueError when a configured value cannot be resolved.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = []
            # Check prioritize directory
            prioritize_dir = os.path.join(os.path.dirname(ref_file), os.pardir,
                                          "coverage", "prioritize")
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(os.path.normpath(os.path.join(prioritize_dir,
                                                                          val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target[-1] == "svprioritize":
                simple_sv_bin = utils.which("simple_sv_annotation.py")
                # The annotation script is optional; realpath(None) would fail without this guard
                if simple_sv_bin:
                    installed_vals += glob.glob(os.path.join(
                        os.path.dirname(os.path.realpath(simple_sv_bin)),
                        "%s*" % os.path.basename(val)))
            if len(installed_vals) == 0:
                raise ValueError("Configuration problem. BED file not found for %s: %s" %
                                 (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target, lambda x: installed_val)
    return data
Example #8
0
def java(items):
    """Check for presence of external Java 1.7 for tools that require it.

    items -- sample data dictionaries, each checked with ``_needs_java``

    Returns None when no item needs Java or when the ``java`` found by
    ``utils.which`` reports a version >= 1.7 and < 1.8. Otherwise returns a
    string describing the problem.
    """
    if not any(_needs_java(d) for d in items):
        return None
    min_version = "1.7"
    max_version = "1.8"
    java_bin = utils.which("java")
    if not java_bin:
        return ("java not found on PATH. Java %s required for MuTect and GATK < 3.6." % min_version)
    p = subprocess.Popen([java_bin, "-Xms250m", "-Xmx250m", "-version"],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output, _ = p.communicate()
    # communicate() returns bytes on Python 3; decode before text processing
    if isinstance(output, bytes):
        output = output.decode("utf-8", "replace")
    version = ""
    for line in output.split("\n"):
        if line.startswith(("java version", "openjdk version")):
            tokens = line.strip().split()
            # The version is the third token; newer JDKs append a release date after it
            version = tokens[2] if len(tokens) > 2 else tokens[-1]
            if version.startswith('"'):
                version = version[1:]
            if version.endswith('"'):
                version = version[:-1]
    if (not version or LooseVersion(version) >= LooseVersion(max_version) or
            LooseVersion(version) < LooseVersion(min_version)):
        return ("java version %s required for running MuTect and GATK < 3.6.\n"
                "It needs to be first on your PATH so running 'java -version' give the correct version.\n"
                "Found version %s at %s" % (min_version, version, java_bin))
    return None
Example #9
0
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the piped ``bwa mem`` command, adding alt-aware post-processing when available.

    When ``<ref_file>.alt`` exists, the output is piped through bwakit's
    ``bwa-postalt.js`` with an HLA output prefix in an ``hla`` directory next
    to ``out_file``.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz

    Returns the command as a single shell string.
    """
    alt_file = ref_file + ".alt"
    postalt_pipe = ""
    if utils.file_exists(alt_file):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        postalt_pipe = " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    bwa = config_utils.get_program("bwa", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    if "options" in bwa_resources:
        bwa_params = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    else:
        bwa_params = ""
    rg_info = novoalign.get_rg_info(data["rgnames"])
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurances to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    mapping = ("{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
               "{ref_file} {fastq1} {fastq2} ").format(
                   bwa=bwa, pairing=pairing, mem_usage=mem_usage, num_cores=num_cores,
                   bwa_params=bwa_params, rg_info=rg_info, ref_file=ref_file,
                   fastq1=fastq1, fastq2=fastq2)
    return mapping + postalt_pipe
Example #10
0
def _gatk4_cmd(jvm_opts, params, data):
    """Build the shell command line for GATK4 through the 'gatk' wrapper. GATK3 is 'gatk3'.

    jvm_opts -- JVM option strings, passed together via --java-options
    params -- tool parameters; each is converted with str()
    data -- accepted for signature compatibility; not read here

    Returns the command as a single shell string.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    gatk_path = utils.which(os.path.join(python_bin_dir, "gatk"))
    template = "%s && export PATH=%s:\"$PATH\" && gatk --java-options '%s' %s"
    return template % (utils.clear_java_home(), utils.get_java_binpath(gatk_path),
                       " ".join(jvm_opts), " ".join(str(p) for p in params))
Example #11
0
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename,
                              build, out_dir, cores=1):
    """
    Quantify expression from a BAM file with rsem-calculate-expression.

    works only in unstranded mode for now (--forward-prob 0.5)

    Returns out_dir, or None when rsem-calculate-expression is not available.
    """
    if not utils.which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    # NOTE(review): the "Test" infix looks like a leftover; confirm against the
    # output prefix used by CALCULATE_EXP.
    done_marker = os.path.join(out_dir, samplename + "Test.genes.results")
    if utils.file_exists(done_marker):
        return out_dir

    if bam.is_paired(bam_file):
        paired_flag = "--paired"
    else:
        paired_flag = ""
    rsem_cmd = CALCULATE_EXP.format(
        core_flag="-p {cores}".format(cores=cores),
        paired_flag=paired_flag,
        bam_file=bam_file,
        rsem_genome_dir=rsem_genome_dir,
        build=build,
        samplename=samplename)
    description = "Calculating transcript expression of {bam_file} using RSEM."

    with transaction.file_transaction(out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        # Run inside the transactional directory
        with utils.chdir(tx_out_dir):
            do.run(rsem_cmd, description.format(bam_file=bam_file))
    return out_dir
Example #12
0
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build,
                              out_dir, cores=1):
    """
    Quantify expression from a BAM file with rsem-calculate-expression.

    works only in unstranded mode for now (--forward-prob 0.5)

    bam_file: alignments passed to RSEM
    rsem_genome_dir: directory holding the RSEM reference named by build
    samplename: output prefix handed to RSEM
    build: name of the RSEM reference inside rsem_genome_dir
    out_dir: directory receiving the results
    cores: number of threads passed with -p

    Returns out_dir, or None when rsem-calculate-expression is not available.
    Existing results in out_dir are reused instead of re-running RSEM.
    """
    if not which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    # RSEM writes gene-level results as <sample_name>.genes.results; the command
    # below uses samplename as that prefix, so check for exactly that file.
    sentinel_file = os.path.join(out_dir, samplename + ".genes.results")
    if file_exists(sentinel_file):
        return out_dir

    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    cmd = ("rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
           "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
           "{samplename}")
    cmd = cmd.format(core_flag=core_flag, paired_flag=paired_flag, bam_file=bam_file,
                     rsem_genome_dir=rsem_genome_dir, build=build, samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM."
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        # RSEM writes relative to the working directory, so run it inside the
        # transactional directory
        with chdir(tx_out_dir):
            do.run(cmd, message.format(bam_file=bam_file))
    return out_dir
Example #13
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    caller -- caller name, used in output file names and as a TSV column
    vcf_file -- input calls, either .vcf.gz or .vcf
    prioritize_by -- file used to derive a gene list, or passed to
        bcbio-prioritize with -k when no gene list is found
    post_prior_fn -- optional callable invoked as
        post_prior_fn(simple_vcf, work_dir, data); its return value replaces
        the annotated VCF path
    work_dir -- directory receiving all outputs
    data -- sample data dictionary

    Returns a tuple of (prioritized TSV path, annotated VCF path).

    NOTE: the shell commands below are filled in with ``format(**locals())``,
    so the local variable names are part of the command templates.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    # gene_list and priority_vcf are only defined (and only used) inside this branch
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                # Plain VCF input: link under the uncompressed name, then bgzip and index it
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    # Request a memory increase by the core count, capped at 30000M
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the simple_sv_annotation.py script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Without a gene list derived from the input, use the panel file from data_dir
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    # Runs on every call, including when simple_vcf already existed
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # {{ and }} are literal braces escaped for str.format; {sample} is
            # substituted into the vawk sample field references (S$<sample>$GT etc.)
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #14
0
def run(calls, data):
    """Combine structural variant calls with MetaSV when enough callers are present.

    calls -- list of per-caller dictionaries; extended in place with MetaSV output
    data -- sample data dictionary

    Returns ``calls``, with a "metasv" entry appended when at least
    ``MIN_CALLERS`` supported callers were supplied.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    metasv_cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                               "--reference", dd.get_ref_file(data),
                               "--bam", dd.get_align_bam(data),
                               "--outdir", work_dir]
    supported_calls = [c for c in calls if c["variantcaller"] in SUPPORTED]
    for sv_call in supported_calls:
        metasv_cmd += ["--%s_vcf" % sv_call["variantcaller"],
                       sv_call.get("vcf_file", sv_call["vrn_file"])]
    if len(supported_calls) < MIN_CALLERS:
        return calls

    if not utils.file_exists(out_file):
        metasv_cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
        do.run(metasv_cmd, "Combine variant calls with MetaSV")
    calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
Example #15
0
def gatk_cmd(name, jvm_opts, params, config=None):
    """Build the shell command line for a gatk or gatk-framework wrapper.

    name -- executable name; "gatk" additionally requires ``config``
    jvm_opts -- JVM option strings, joined with spaces
    params -- tool parameters; each is converted with str()
    config -- configuration dictionary, or a data dictionary with a "config" key

    When ``name`` is "gatk" and "gatk4" is listed in tools_on, the command is
    delegated to ``_gatk4_cmd``. Returns the command string, or None when the
    executable cannot be located.
    """
    if name == "gatk":
        assert config, "Need configuration input for gatk to distinguish gatk4"
        # Wrap a bare configuration so it can be read like a data dictionary
        needs_wrap = isinstance(config, dict) and "config" not in config
        data = {"config": config} if needs_wrap else config
        if "gatk4" in dd.get_tools_on(data):
            return _gatk4_cmd(jvm_opts, params, data)
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    # if we can't find via the local executable, fallback to being in the path
    executable = utils.which(os.path.join(python_bin_dir, name)) or utils.which(name)
    if not executable:
        return None
    template = "unset JAVA_HOME && export PATH=%s:$PATH && %s %s %s"
    return template % (os.path.dirname(executable), executable,
                       " ".join(jvm_opts), " ".join(str(p) for p in params))
Example #16
0
def _find_executable(name):
    """Locate an executable via ``utils.which``, then next to the running python.

    Returns the resolved path, or None when neither lookup succeeds.
    """
    found = utils.which(name)
    if found:
        return found
    beside_python = os.path.join(os.path.dirname(sys.executable), name)
    return beside_python if os.path.exists(beside_python) else None
Example #17
0
def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in ``work_dir``.

    items -- sample data dictionaries; supply the BAM inputs when unpaired
    paired -- tumor/normal pairing object, or a false value when unpaired
    work_dir -- directory passed to Manta as its run directory

    Returns the path to ``runWorkflow.py`` inside ``work_dir``.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script):
        return workflow_script

    args = [sys.executable, utils.which("configManta.py")]
    if paired:
        if paired.normal_bam:
            args.append("--normalBam=%s" % paired.normal_bam)
        args.append("--tumorBam=%s" % paired.tumor_bam)
        ref_data = paired.tumor_data
    else:
        args.extend(["--bam=%s" % dd.get_align_bam(sample) for sample in items])
        ref_data = items[0]
    args.append("--referenceFasta=%s" % dd.get_ref_file(ref_data))
    args.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(ref_data) not in ["genome"]:
        args.append("--exome")
    do.run(args, "Configure manta SV analysis")
    return workflow_script
Example #18
0
def gatk_cmd(name, jvm_opts, params, config=None):
    """Build the shell command line for gatk using locally installed java.

    name -- executable name
    jvm_opts -- JVM option strings, joined with spaces
    params -- tool parameters; each is converted with str()
    config -- configuration dictionary, or a data dictionary with a "config" key

    For "gatk", the command is delegated to ``_gatk4_cmd`` unless "gatk4" is
    listed in tools_off, in which case the "gatk3" executable is used.
    Returns the command string, or None when the executable cannot be located.
    """
    if name == "gatk":
        # Wrap a bare configuration so it can be read like a data dictionary
        needs_wrap = isinstance(config, dict) and "config" not in config
        data = {"config": config} if needs_wrap else config
        if data and "gatk4" in dd.get_tools_off(data):
            name = "gatk3"
        else:
            return _gatk4_cmd(jvm_opts, params, data)
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    # if we can't find via the local executable, fallback to being in the path
    executable = utils.which(os.path.join(python_bin_dir, name)) or utils.which(name)
    if not executable:
        return None
    template = "%s && export PATH=%s:\"$PATH\" && %s %s %s"
    return template % (utils.clear_java_home(), utils.get_java_binpath(executable), executable,
                       " ".join(jvm_opts), " ".join(str(p) for p in params))
Example #19
0
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in tx_work_dir.

    paired -- tumor/normal pairing object providing BAMs, data and tumor name
    ref_file -- reference FASTA
    region -- region used to build the call regions BED
    out_file -- final output file, passed to the region BED helper
    tx_work_dir -- run directory; created if needed

    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file),
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    # Log message names the somatic workflow (previously mislabelled as germline)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #20
0
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in tx_work_dir.

    align_bams -- BAM files, each passed with --bam
    items -- sample data dictionaries
    ref_file -- reference FASTA
    region -- region used to build the call regions BED and ploidy input
    out_file -- final output file, passed to the region and ploidy helpers
    tx_work_dir -- run directory; created if needed

    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    config_script = os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))
    args = [sys.executable, config_script]
    args.append("--referenceFasta=%s" % ref_file)
    args.append("--callRegions=%s" % _get_region_bed(region, items, out_file))
    args.append("--ploidy=%s" % _get_ploidy(shared.to_multiregion(region), items, out_file))
    args.append("--runDir=%s" % tx_work_dir)
    args.extend(["--bam=%s" % bam_path for bam_path in align_bams])
    if any(dd.get_coverage_interval(d) not in ["genome"] for d in items):
        args.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(d) for d in items])
    do.run(args, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #21
0
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in tx_work_dir.

    paired -- tumor/normal pairing object providing BAMs, data and tumor name
    ref_file -- reference FASTA
    region -- region used to build the call regions BED
    out_file -- final output file, passed to the region BED helper
    tx_work_dir -- run directory; created if needed

    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
           os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file)
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % cur_bed,
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # Log message names the somatic workflow (previously mislabelled as germline)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #22
0
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run in tx_work_dir.

    align_bams -- BAM files, each passed with --bam
    items -- sample data dictionaries; the first one is used for the targeted check
    ref_file -- reference FASTA
    region -- region used to build the call regions BED and ploidy input
    out_file -- final output file, passed to the region and ploidy helpers
    tx_work_dir -- run directory; created if needed

    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    script_name = "configureStrelkaGermlineWorkflow.py"
    args = [utils.get_program_python(script_name),
            os.path.realpath(utils.which(script_name))]
    regions_bed = get_region_bed(region, items, out_file)
    args.append("--referenceFasta=%s" % ref_file)
    args.append("--callRegions=%s" % regions_bed)
    args.append("--ploidy=%s" % _get_ploidy(shared.to_multiregion(region), items, out_file))
    args.append("--runDir=%s" % tx_work_dir)
    args.extend(["--bam=%s" % bam_path for bam_path in align_bams])
    if _is_targeted_region(regions_bed, items[0]):
        args.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(d) for d in items])
    do.run(args, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #23
0
def run(data):
    """HLA typing with bwakit, parsing output from called genotype files.

    Looks for files starting with ``<align bam name>.hla.`` in an ``hla``
    directory next to the alignment BAM. When any exist, the organized calls
    are stored under ``data["hla"]["calls"]``.

    Returns the (possibly updated) data dictionary.
    """
    # Resolved up front as in the original flow; the run-HLA invocation itself
    # is currently disabled, so only existing genotype files are organized.
    bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
    align_file = dd.get_align_bam(data)
    hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(align_file), "hla"))
    hla_base = os.path.join(hla_dir, os.path.basename(align_file) + ".hla")
    if not glob.glob(hla_base + ".*"):
        return data
    top_calls = hla_base + ".top"
    if not utils.file_exists(top_calls):
        top_calls = _organize_calls(top_calls, hla_base, data)
    data["hla"] = {"calls": top_calls}
    return data
Example #24
0
def run(data):
    """HLA typing with bwakit, parsing output from called genotype files.

    Uses the common prefix of the ``data["hla"]["fastq"]`` files (without
    trailing dots) as the run-HLA base name. When fastq files are present,
    ``data["hla"]`` gains "call_file" and "hlacaller" entries.

    Returns the (possibly updated) data dictionary.
    """
    bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
    hla_fqs = tz.get_in(["hla", "fastq"], data, [])
    if len(hla_fqs) == 0:
        return data
    hla_base = os.path.commonprefix(hla_fqs).rstrip(".")
    call_file = hla_base + ".top"
    if not utils.file_exists(call_file):
        hla_cmd = "{bwakit_dir}/run-HLA {hla_base}".format(bwakit_dir=bwakit_dir,
                                                           hla_base=hla_base)
        do.run(hla_cmd, "HLA typing with bwakit")
        call_file = _organize_calls(call_file, hla_base, data)
    data["hla"].update({"call_file": call_file,
                        "hlacaller": "bwakit"})
    return data
Example #25
0
def prepare_rsem_reference(gtf, multifasta, build):
    """
    Build an RSEM reference inside a temporary directory that is kept afterwards.

    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)

    Returns the directory holding the reference, or None when
    rsem-prepare-reference is not available.
    """
    if not which("rsem-prepare-reference"):
        logger.info("Skipping prepping RSEM reference because rsem-prepare-reference could "
                    "not be found.")
        return None

    template = "rsem-prepare-reference --gtf {gtf} {multifasta} {build}"
    with tx_tmpdir(remove=False) as rsem_genome_dir, chdir(rsem_genome_dir):
        description = "Preparing rsem reference from %s" % gtf
        do.run(template.format(gtf=gtf, multifasta=multifasta, build=build), description)
    return rsem_genome_dir
Example #26
0
def prepare_rsem_reference(gtf, multifasta, build):
    """
    Build an RSEM reference inside a temporary directory that is kept afterwards.

    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)

    Returns the directory holding the reference, or None when
    rsem-prepare-reference is not available.
    """
    if not utils.which("rsem-prepare-reference"):
        logger.info("Skipping prepping RSEM reference because "
                    "rsem-prepare-reference could not be found.")
        return None

    prepare_cmd = PREPARE_REFERENCE.format(gtf=gtf, multifasta=multifasta, build=build)
    with transaction.tx_tmpdir(remove=False) as rsem_genome_dir, utils.chdir(rsem_genome_dir):
        description = "Preparing rsem reference from %s" % gtf
        do.run(prepare_cmd, description)
    return rsem_genome_dir
Example #27
0
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run in tx_work_dir.

    paired -- tumor/normal pairing object providing BAMs, data and tumor name
    ref_file -- reference FASTA
    region -- region used to build the call regions BED
    out_file -- final output file, passed to the region BED helper
    tx_work_dir -- run directory; created if needed

    Returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [
        sys.executable,
        os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))
    ]
    cmd += [
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % _get_region_bed(
            region, [paired.tumor_data, paired.normal_data], out_file),
        "--runDir=%s" % tx_work_dir,
        "--normalBam=%s" % paired.normal_bam,
        "--tumorBam=%s" % paired.tumor_bam
    ]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    # Log message names the somatic workflow (previously mislabelled as germline)
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #28
0
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the piped ``bwa mem`` command, adding alt-aware post-processing when available.

    When ``<ref_file>.alt`` exists, the output is piped through bwakit's
    ``bwa-postalt.js`` with an HLA output prefix in an ``hla`` directory next
    to ``out_file``. With the "sentieon-bwa" aligner the sentieon executable
    is used and its license export is prefixed to the command.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz

    Returns the command as a single shell string.
    """
    alt_file = ref_file + ".alt"
    postalt_pipe = ""
    if utils.file_exists(alt_file):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        postalt_pipe = " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe, exports = "sentieon-bwa", sentieon.license_export(data)
    else:
        bwa_exe, exports = "bwa", ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    if "options" in bwa_resources:
        bwa_params = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    else:
        bwa_params = ""
    rg_info = novoalign.get_rg_info(data["rgnames"])
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurances to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    mapping = (
        "{exports}{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
        "{ref_file} {fastq1} {fastq2} ").format(
            exports=exports, bwa=bwa, pairing=pairing, mem_usage=mem_usage,
            num_cores=num_cores, bwa_params=bwa_params, rg_info=rg_info,
            ref_file=ref_file, fastq1=fastq1, fastq2=fastq2)
    return mapping + postalt_pipe
Example #29
0
def prepare_rsem_reference(gtf, multifasta, build):
    """Prepare an RSEM reference inside a directory from ``transaction.tx_tmpdir``.

    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)

    Returns the directory the command was run in (created with ``remove=False``),
    or None when ``utils.which`` cannot find rsem-prepare-reference.
    """
    rsem_exe = utils.which("rsem-prepare-reference")
    if not rsem_exe:
        logger.info("Skipping prepping RSEM reference because "
                    "rsem-prepare-reference could not be found.")
        return None

    cmd = PREPARE_REFERENCE.format(gtf=gtf, multifasta=multifasta, build=build)
    # Run from inside the output directory for the duration of the command.
    with transaction.tx_tmpdir(remove=False) as rsem_genome_dir, \
            utils.chdir(rsem_genome_dir):
        do.run(cmd, "Preparing rsem reference from %s" % gtf)
    return rsem_genome_dir
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.

    For the ``svprioritize`` and ``coverage`` algorithm settings, a value that is
    set but is not an existing path is treated as the name of an installed
    resource and replaced with the matching file.

    Lookup locations:
      - ``<reference dir>/../coverage/prioritize/<val>*.bed`` and ``*.bed.gz``
      - for ``svprioritize`` only: ``<basename(val)>*`` next to the
        ``simple_sv_annotation.py`` executable, when that tool is installed

    With several candidates, a file ending in exactly ``<val>.bed.gz`` or
    ``<val>.bed`` wins; otherwise the last candidate in sorted order is used
    (handles date-stamped inputs).

    Returns the updated data; raises ValueError when no installed file matches.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        key_path = ["config", "algorithm"] + target
        val = tz.get_in(key_path, data)
        if not val or os.path.exists(val):
            continue
        installed_vals = []
        # Check prioritize directory
        for ext in [".bed", ".bed.gz"]:
            installed_vals += glob.glob(
                os.path.normpath(
                    os.path.join(os.path.dirname(ref_file), os.pardir,
                                 "coverage", "prioritize",
                                 val + "*%s" % ext)))
        # Check sv-annotation directory for prioritize gene name lists
        if target[-1] == "svprioritize":
            # utils.which gives a falsy result when the tool is missing; skip the
            # lookup instead of crashing in realpath so the ValueError below can
            # report the real configuration problem.
            sv_annotate = utils.which("simple_sv_annotation.py")
            if sv_annotate:
                installed_vals += glob.glob(
                    os.path.join(
                        os.path.dirname(os.path.realpath(sv_annotate)),
                        "%s*" % os.path.basename(val)))
        if not installed_vals:
            raise ValueError(
                "Configuration problem. BED file not found for %s: %s" %
                (target, val))
        if len(installed_vals) == 1:
            installed_val = installed_vals[0]
        else:
            # check for partial matches
            exact_suffixes = (val + ".bed.gz", val + ".bed")
            installed_val = next(
                (v for v in installed_vals if v.endswith(exact_suffixes)), None)
            # handle date-stamped inputs
            if not installed_val:
                installed_val = sorted(installed_vals, reverse=True)[0]
        # Bind the resolved value explicitly rather than closing over the loop variable.
        data = tz.update_in(data, key_path,
                            lambda x, resolved=installed_val: resolved)
    return data
Example #31
0
def _configure_germline(align_bams, items, ref_file, region, out_file,
                        tx_work_dir):
    """Run configureStrelkaGermlineWorkflow.py to set up a germline run directory.

    The workflow is marked ``--targeted`` when any sample's coverage interval
    is something other than "genome".

    Returns the path of ``runWorkflow.py`` inside ``tx_work_dir``.
    """
    utils.safe_makedir(tx_work_dir)
    config_script = os.path.realpath(
        utils.which("configureStrelkaGermlineWorkflow.py"))
    ref_arg = "--referenceFasta=%s" % ref_file
    regions_arg = "--callRegions=%s" % _get_region_bed(region, items, out_file)
    ploidy_arg = "--ploidy=%s" % _get_ploidy(region, items, out_file)
    cmd = [sys.executable, config_script,
           ref_arg, regions_arg, ploidy_arg,
           "--runDir=%s" % tx_work_dir]
    cmd.extend("--bam=%s" % bam for bam in align_bams)
    is_targeted = any(
        dd.get_coverage_interval(d) not in ["genome"] for d in items)
    if is_targeted:
        cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(d) for d in items])
    do.run(cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #32
0
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Run configureStrelkaSomaticWorkflow.py to set up a tumor/normal run directory.

    paired: object providing tumor_data, normal_data, tumor_bam, normal_bam
        and tumor_name for the tumor/normal pair.

    Returns the path of ``runWorkflow.py`` inside ``tx_work_dir``, or None when
    ``get_region_bed`` yields no regions for ``region`` (nothing is configured
    in that case).
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [
        utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
        os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))
    ]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data],
                             out_file)
    if not cur_bed:
        # No regions to call: make the previously implicit None result explicit.
        return None
    cmd += [
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % cur_bed,
        "--runDir=%s" % tx_work_dir,
        "--normalBam=%s" % paired.normal_bam,
        "--tumorBam=%s" % paired.tumor_bam
    ]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # The description previously said "germline", a copy-paste slip that mislabelled
    # somatic runs in the logs.
    do.run(cmd,
           "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #33
0
def _configure_germline(align_bams, items, ref_file, region, out_file,
                        tx_work_dir):
    """Set up a Strelka2 germline run directory via configureStrelkaGermlineWorkflow.py.

    The call regions come from ``get_region_bed``; ploidy is computed on the
    multi-region form of ``region``. ``--targeted`` is added when
    ``_is_targeted_region`` says so for the first item.

    Returns the path of ``runWorkflow.py`` inside ``tx_work_dir``.
    """
    utils.safe_makedir(tx_work_dir)
    workflow_script = os.path.realpath(
        utils.which("configureStrelkaGermlineWorkflow.py"))
    # NOTE(review): cur_bed is used without an emptiness check here -- confirm
    # get_region_bed always yields a usable BED for germline calling.
    cur_bed = get_region_bed(region, items, out_file)
    ref_arg = "--referenceFasta=%s" % ref_file
    regions_arg = "--callRegions=%s" % cur_bed
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    cmd = [sys.executable, workflow_script,
           ref_arg, regions_arg,
           "--ploidy=%s" % ploidy,
           "--runDir=%s" % tx_work_dir]
    for bam in align_bams:
        cmd.append("--bam=%s" % bam)
    if _is_targeted_region(cur_bed, items[0]):
        cmd.append("--targeted")
    names = [dd.get_sample_name(d) for d in items]
    do.run(cmd, "Configure Strelka2 germline calling: %s" % ", ".join(names))
    return os.path.join(tx_work_dir, "runWorkflow.py")
Example #34
0
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.

    The background BED shipped in verifybamid2's ``resource`` directory is copied
    locally with ``chr`` prepended to every line, used as the mpileup target
    regions, and the ``chr`` prefix is stripped again from the pileup output.

    Returns ``<out_base>-mpileup.txt``; an existing file is reused.
    """
    out_file = "%s-mpileup.txt" % out_base
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        verifybamid_dir = os.path.dirname(os.path.realpath(utils.which("verifybamid2")))
        bed_name = "%s.%s.%s.vcf.gz.dat.bed" % (background["dataset"],
                                                background["nvars"], background["build"])
        background_bed = os.path.normpath(os.path.join(verifybamid_dir, "resource", bed_name))
        local_bed = os.path.join(os.path.dirname(out_base),
                                 "%s.%s-hg19.bed" % (background["dataset"], background["nvars"]))
        if not utils.file_exists(local_bed):
            with file_transaction(data, local_bed) as tx_local_bed:
                with open(background_bed) as in_handle, open(tx_local_bed, "w") as out_handle:
                    # Prefix every BED line with "chr".
                    out_handle.writelines("chr%s" % line for line in in_handle)
        mpileup_cl = samtools.prep_mpileup([bam_file], dd.get_ref_file(data), data["config"],
                                           want_bcf=False, target_regions=local_bed)
        cl = "{mpileup_cl} | sed 's/^chr//' > {tx_out_file}"
        do.run(cl.format(mpileup_cl=mpileup_cl, tx_out_file=tx_out_file),
               "Create pileup from BAM input")
    return out_file
Example #35
0
def _run_funnel(args):
    """Run funnel TES server with rabix bunny for CWL.

    Steps:
      1. Copy rabix's ``config`` directory (found next to the resolved ``rabix``
         executable) into ``funnel_work/rabix_config``, rewriting the
         ``backend.embedded.types`` line of ``core.properties`` to select TES.
      2. Start a local funnel server as a child process.
      3. Run rabix against that server from inside the work directory.
      4. Always stop the funnel server afterwards, even if the run fails.

    args: parsed command line arguments; ``directory`` and ``no_container`` are read.
    """
    host = "localhost"
    port = "8088"
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "funnel_work"))
    log_file = os.path.join(work_dir, "%s-funnel.log" % project_name)
    # Create bunny configuration directory with TES backend
    orig_config_dir = os.path.join(os.path.dirname(os.path.realpath(utils.which("rabix"))), "config")
    work_config_dir = utils.safe_makedir(os.path.join(work_dir, "rabix_config"))
    for fname in os.listdir(orig_config_dir):
        orig_path = os.path.join(orig_config_dir, fname)
        work_path = os.path.join(work_config_dir, fname)
        if fname == "core.properties":
            with open(orig_path) as in_handle, open(work_path, "w") as out_handle:
                for line in in_handle:
                    if line.startswith("backend.embedded.types"):
                        line = "backend.embedded.types=TES\n"
                    out_handle.write(line)
        else:
            shutil.copy(orig_path, work_path)
    flags = ["-c", work_config_dir,
             "-tes-url=http://%s:%s" % (host, port), "-tes-storage=%s" % work_dir]
    if args.no_container:
        _remove_bcbiovm_path()
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    funnelp = subprocess.Popen(["funnel", "server", "run",
                                "--Server.HostName", host, "--Server.HTTPPort", port,
                                "--LocalStorage.AllowedDirs", work_dir,
                                "--Worker.WorkDir", os.path.join(work_dir, "funnel-work")])
    try:
        with utils.chdir(work_dir):
            _run_tool(cmd, not args.no_container, work_dir, log_file)
    finally:
        funnelp.kill()
        # Reap the killed server so it does not linger as an unwaited child process.
        funnelp.wait()
Example #36
0
def _run_funnel(args):
    """Run funnel TES server with rabix bunny for CWL.

    Prepares a rabix configuration directory under ``funnel_work`` whose
    ``core.properties`` selects the TES backend, launches a local funnel server
    as a child process, runs rabix against it from inside the work directory,
    and always shuts the server down afterwards.

    args: parsed command line arguments; ``directory`` and ``no_container`` are read.
    """
    host = "localhost"
    port = "8088"
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "funnel_work"))
    log_file = os.path.join(work_dir, "%s-funnel.log" % project_name)
    # Create bunny configuration directory with TES backend
    orig_config_dir = os.path.join(os.path.dirname(os.path.realpath(utils.which("rabix"))), "config")
    work_config_dir = utils.safe_makedir(os.path.join(work_dir, "rabix_config"))
    for fname in os.listdir(orig_config_dir):
        src = os.path.join(orig_config_dir, fname)
        dest = os.path.join(work_config_dir, fname)
        if fname != "core.properties":
            shutil.copy(src, dest)
            continue
        # Copy core.properties line by line, switching the embedded backend to TES.
        with open(src) as in_handle, open(dest, "w") as out_handle:
            for line in in_handle:
                if line.startswith("backend.embedded.types"):
                    line = "backend.embedded.types=TES\n"
                out_handle.write(line)
    flags = ["-c", work_config_dir,
             "-tes-url=http://%s:%s" % (host, port), "-tes-storage=%s" % work_dir]
    if args.no_container:
        _remove_bcbiovm_path()
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    funnelp = subprocess.Popen(["funnel", "server", "run",
                                "--Server.HostName", host, "--Server.HTTPPort", port,
                                "--LocalStorage.AllowedDirs", work_dir,
                                "--Worker.WorkDir", os.path.join(work_dir, "funnel-work")])
    try:
        with utils.chdir(work_dir):
            _run_tool(cmd, not args.no_container, work_dir, log_file)
    finally:
        funnelp.kill()
        # Wait for the killed server to exit so the child process is reaped.
        funnelp.wait()
Example #37
0
 def _get_ericscript_path(self):
     """Return the directory holding the real (symlink-resolved) EricScript executable.

     The executable is looked up as ``self.EXECUTABLE`` inside the bcbio bin directory.
     """
     candidate = os.path.join(utils.get_bcbio_bin(), self.EXECUTABLE)
     resolved = os.path.realpath(utils.which(candidate))
     return os.path.dirname(resolved)
Example #38
0
def _get_cmd():
    """Return the base MetaSV command: the python interpreter for run_metasv.py, then the script path."""
    script = "run_metasv.py"
    python_exe = utils.get_program_python(script)
    script_path = utils.which(script)
    return [python_exe, script_path]
Example #39
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir,
                    data):
    """Provide prioritized tab delimited output for a single caller.

    Steps:
      1. If the simplified VCF does not exist yet, produce a "priority" VCF:
         either a link to the input VCF (when a gene list is found for
         ``prioritize_by``) or the output of ``bcbio-prioritize known``
         restricted to ``prioritize_by``. Then annotate it with
         ``simple_sv_annotation.py`` into ``<sample>-<caller>-simple.vcf.gz``.
      2. Sort by reference, bgzip and index the simplified VCF.
      3. Apply ``post_prior_fn(simple_vcf, work_dir, data)`` when one is given.
      4. Convert the result to ``<sample>-<caller>-prioritize.tsv`` with vawk
         when ``utils.file_uptodate`` reports the TSV as not current.

    Returns a tuple of (tab delimited output file, simplified VCF).

    NOTE: the shell commands are filled in with ``cmd.format(**locals())``, so
    the local variable names below are part of the command templates.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir,
                            "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir,
                              "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                # Uncompressed input: link it under a .vcf name, then bgzip and
                # index that link while keeping the original in place.
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file,
                                   priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(
                    ".vcf.gz", ".vcf"),
                                         data["config"],
                                         remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources(
                        "bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    # Adjust JVM memory options upward by the core count,
                    # with a maximum of 30000M.
                    jvm_opts = config_utils.adjust_opts(
                        jvm_opts, {
                            "algorithm": {
                                "memory_adjust": {
                                    "direction": "increase",
                                    "maximum": "30000M",
                                    "magnitude": dd.get_cores(data)
                                }
                            }
                        })
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = (
                        "{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                        " -k {prioritize_by}")
                    do.run(cmd.format(**locals()),
                           "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the resolved
        # simple_sv_annotation.py script.
        data_dir = os.path.dirname(
            os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Fall back to the az-cancer-panel.txt gene list in data_dir when
            # no gene list was found for prioritize_by.
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(
                    data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(
        vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # vawk program: for records whose FILTER is PASS or "." and whose
            # sample genotype is not 0/0, print caller, sample, position, end,
            # SV type (with ID and MATEID appended for BND), LOF, SIMPLE_ANN and
            # the SR/PE/PR sample fields. Doubled braces are literal braces
            # after str.format.
            cmd = (
                "{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                "print CALLER,SNAME,$1,$2,I$END,"
                """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                "I$LOF,I$SIMPLE_ANN,"
                "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #40
0
def _choose_htseq_count_executable(data):
    """Locate htseq-count, honoring a ``resources -> htseq-count -> cmd`` override in the config.

    Falls back to the plain ``htseq-count`` name when no override is configured.
    """
    cmd_keys = ("resources", "htseq-count", "cmd")
    configured_cmd = get_in(data["config"], cmd_keys, "htseq-count")
    return which(configured_cmd)
Example #41
0
def _gatk4_cmd(jvm_opts, params, data):
    """Retrieve unified command for GATK4, using gatk-launch.

    gatk-launch is looked up next to the running python interpreter; the
    returned shell string unsets JAVA_HOME and prepends that directory to PATH
    before invoking gatk-launch. ``data`` is not used here.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    gatk_cmd = utils.which(os.path.join(python_bin_dir, "gatk-launch"))
    gatk_dir = os.path.dirname(gatk_cmd)
    java_opts = " ".join(jvm_opts)
    gatk_args = " ".join(str(x) for x in params)
    template = "unset JAVA_HOME && export PATH=%s:$PATH && gatk-launch --javaOptions '%s' %s"
    return template % (gatk_dir, java_opts, gatk_args)
Example #42
0
def _choose_htseq_count_executable(data):
    """Resolve the htseq-count executable to run.

    A command configured under ``resources -> htseq-count -> cmd`` takes
    precedence; otherwise the default ``htseq-count`` name is looked up.
    """
    default_cmd = "htseq-count"
    key_path = ("resources", default_cmd, "cmd")
    return which(get_in(data["config"], key_path, default_cmd))
Example #43
0
 def _get_ericscript_path(self):
     """Retrieve the directory of the isolated EricScript anaconda environment's executable.

     ``self.EXECUTABLE`` is looked up in the bcbio bin directory and symlinks are
     resolved before taking the containing directory.
     """
     bcbio_bin = utils.get_bcbio_bin()
     es_path = utils.which(os.path.join(bcbio_bin, self.EXECUTABLE))
     real_es_path = os.path.realpath(es_path)
     return os.path.dirname(real_es_path)