Beispiel #1
0
def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in
    ``work_dir`` and ``_out_of_date`` does not flag it.

    Returns the path to ``runWorkflow.py`` inside ``work_dir``.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script) and not _out_of_date(workflow_script):
        return workflow_script

    config_script = os.path.realpath(utils.which("configManta.py"))
    manta_cmd = [utils.get_program_python("configManta.py"), config_script]
    # Input BAMs: tumor (plus normal when present) for paired runs,
    # otherwise one --bam argument per sample.
    if not paired:
        manta_cmd.extend("--bam=%s" % dd.get_align_bam(item) for item in items)
    else:
        if paired.normal_bam:
            manta_cmd.append("--normalBam=%s" % paired.normal_bam)
        manta_cmd.append("--tumorBam=%s" % paired.tumor_bam)

    # The remaining settings are read from a single representative sample.
    data = items[0] if not paired else paired.tumor_data
    manta_cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    manta_cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        manta_cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        manta_cmd.extend(["--region", region])
    extra_opts = config_utils.get_resources("manta", data["config"]).get("options")
    if extra_opts:
        manta_cmd.extend(str(opt) for opt in extra_opts)
    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        manta_cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])
    do.run(manta_cmd, "Configure manta SV analysis")
    return workflow_script
Beispiel #2
0
def _run_workflow(data, workflow_file, work_dir):
    """Execute a prepared Strelka2 workflow script in local mode.

    The ``workspace`` directory under ``work_dir`` is removed before the run
    and again after it completes.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    python_exe = utils.get_program_python("configureStrelkaGermlineWorkflow.py")
    num_cores = dd.get_num_cores(data)
    run_cmd = [python_exe, workflow_file, "-m", "local", "-j", num_cores, "--quiet"]
    do.run(run_cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(workspace_dir)
Beispiel #3
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Execute a prepared manta workflow script in local mode.

    The core count is taken from the tumor sample for paired runs, otherwise
    from the first item. The ``workspace`` directory under ``work_dir`` is
    removed before the run and again after it completes.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    if paired:
        data = paired.tumor_data
    else:
        data = items[0]
    python_exe = utils.get_program_python("configManta.py")
    run_cmd = [python_exe, workflow_file, "-m", "local", "-j", dd.get_num_cores(data)]
    do.run(run_cmd, "Run manta SV analysis")
    utils.remove_safe(workspace_dir)
def _tophat_major_version(config):
    """Return the major version of the configured tophat as an int.

    Runs ``tophat --version`` with the python interpreter returned by
    ``utils.get_program_python`` and parses the second whitespace-separated
    token of the output.

    Raises subprocess.CalledProcessError if the command exits non-zero, and
    IndexError/ValueError if the output does not look like a version string.
    """
    cmd = [
        utils.get_program_python("tophat"),
        config_utils.get_program("tophat", config, default="tophat"),
        "--version",
    ]
    output = subprocess.check_output(cmd)
    # check_output returns bytes on Python 3. Decode it instead of calling
    # str(), which would produce the "b'...'" repr and only parse by accident.
    if isinstance(output, bytes):
        output = output.decode("utf-8", "replace")

    # tophat --version returns strings like this: Tophat v2.0.4
    version_string = output.strip().split()[1]
    # Drop the leading "v" when present rather than blindly cutting one character.
    major_version = int(version_string.split(".")[0].lstrip("vV"))
    return major_version
def prepare_dexseq(gtf):
    """Run the DEXSeq preparation script on a GTF file.

    The output is written next to the input as
    ``<gtf without extension>.dexseq.gff3``; an existing output is reused.

    Returns the output path, or None when ``_dexseq_preparation_path()``
    yields no script.

    Raises subprocess.CalledProcessError if the script exits non-zero.
    """
    out_file = os.path.splitext(gtf)[0] + ".dexseq.gff3"
    if file_exists(out_file):
        return out_file

    dexseq_path = _dexseq_preparation_path()
    if not dexseq_path:
        return None
    executable = get_program_python("htseq-count")
    # Pass an argument list without a shell so that paths containing spaces
    # or shell metacharacters reach the script unchanged.
    subprocess.check_call([executable, dexseq_path, gtf, out_file])
    return out_file
Beispiel #6
0
def prepare_dexseq(gtf):
    """Run the DEXSeq preparation script on a GTF file.

    The output is written next to the input as
    ``<gtf without extension>.dexseq.gff3``; an existing output is reused.

    Returns the output path, or None when ``_dexseq_preparation_path()``
    yields no script.

    Raises subprocess.CalledProcessError if the script exits non-zero.
    """
    out_file = os.path.splitext(gtf)[0] + ".dexseq.gff3"
    if file_exists(out_file):
        return out_file

    dexseq_path = _dexseq_preparation_path()
    if not dexseq_path:
        return None
    executable = get_program_python("htseq-count")
    # Pass an argument list without a shell so that paths containing spaces
    # or shell metacharacters reach the script unchanged.
    subprocess.check_call([executable, dexseq_path, gtf, out_file])
    return out_file
Beispiel #7
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Execute a prepared manta workflow script in local mode.

    The core count is taken from the tumor sample for paired runs, otherwise
    from the first item. The ``workspace`` directory under ``work_dir`` is
    removed before the run and again after it completes.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    if paired:
        data = paired.tumor_data
    else:
        data = items[0]
    python_exe = utils.get_program_python("configManta.py")
    num_cores = dd.get_num_cores(data)
    run_cmd = [python_exe, workflow_file, "-m", "local", "-j", num_cores]
    do.run(run_cmd, "Run manta SV analysis")
    utils.remove_safe(workspace_dir)
Beispiel #8
0
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run directory.

    Creates ``tx_work_dir``, builds the ``configureStrelkaSomaticWorkflow.py``
    command for the tumor and normal BAMs in ``paired``, restricted to the BED
    file returned by ``get_region_bed``, and runs it.

    Returns the path to ``runWorkflow.py`` inside ``tx_work_dir``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
           os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file)
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % cur_bed,
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # This is the somatic configurator; the message previously said "germline".
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def make_hisat2_splicesites(gtf_file):
    """Extract splice sites from a GTF file with hisat2's helper script.

    The output is written next to the input as
    ``<gtf_file without extension>-splicesites.txt``; an existing output is
    reused without looking up hisat2 at all.

    Returns the output path, or None when ``hisat2_extract_splice_sites.py``
    is not found beside the interpreter returned by ``get_program_python``.

    Raises subprocess.CalledProcessError if the script exits non-zero; any
    partially written output is removed first so it is not reused later.
    """
    base, _ = os.path.splitext(gtf_file)
    out_file = base + "-splicesites.txt"
    if file_exists(out_file):
        return out_file
    executable = get_program_python("hisat2")
    hisat2_script = os.path.join(os.path.dirname(executable),
                                 "hisat2_extract_splice_sites.py")
    if not file_exists(hisat2_script):
        return None
    try:
        # Redirect stdout through a file handle instead of a shell ">" so no
        # shell is involved and unusual characters in paths are harmless.
        with open(out_file, "w") as out_handle:
            subprocess.check_call([executable, hisat2_script, gtf_file],
                                  stdout=out_handle)
    except (OSError, subprocess.CalledProcessError):
        if os.path.exists(out_file):
            os.remove(out_file)
        raise
    return out_file
Beispiel #10
0
def make_hisat2_splicesites(gtf_file):
    """Extract splice sites from a GTF file with hisat2's helper script.

    The output is written next to the input as
    ``<gtf_file without extension>-splicesites.txt``; an existing output is
    reused without looking up hisat2 at all.

    Returns the output path, or None when ``hisat2_extract_splice_sites.py``
    is not found beside the interpreter returned by ``get_program_python``.

    Raises subprocess.CalledProcessError if the script exits non-zero; any
    partially written output is removed first so it is not reused later.
    """
    base, _ = os.path.splitext(gtf_file)
    out_file = base + "-splicesites.txt"
    if file_exists(out_file):
        return out_file
    executable = get_program_python("hisat2")
    hisat2_script = os.path.join(os.path.dirname(executable),
                                 "hisat2_extract_splice_sites.py")
    if not file_exists(hisat2_script):
        return None
    try:
        # Redirect stdout through a file handle instead of a shell ">" so no
        # shell is involved and unusual characters in paths are harmless.
        with open(out_file, "w") as out_handle:
            subprocess.check_call([executable, hisat2_script, gtf_file],
                                  stdout=out_handle)
    except (OSError, subprocess.CalledProcessError):
        if os.path.exists(out_file):
            os.remove(out_file)
        raise
    return out_file
Beispiel #11
0
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run directory for one or more BAMs.

    Creates ``tx_work_dir``, assembles the
    ``configureStrelkaGermlineWorkflow.py`` command with the region BED,
    ploidy file and every BAM in ``align_bams``, and runs it.

    Returns the path to ``runWorkflow.py`` inside ``tx_work_dir``.
    """
    utils.safe_makedir(tx_work_dir)
    script_name = "configureStrelkaGermlineWorkflow.py"
    python_exe = utils.get_program_python(script_name)
    config_script = os.path.realpath(utils.which(script_name))
    cur_bed = get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    config_cmd = [
        python_exe,
        config_script,
        "--referenceFasta=%s" % ref_file,
        "--callRegions=%s" % cur_bed,
        "--ploidy=%s" % ploidy,
        "--runDir=%s" % tx_work_dir,
    ]
    config_cmd.extend("--bam=%s" % bam_file for bam_file in align_bams)
    if _is_targeted_region(cur_bed, items[0]):
        config_cmd.append("--targeted")
    sample_names = ", ".join([dd.get_sample_name(d) for d in items])
    do.run(config_cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _sam_to_grouped_umi_cl(data, umi_consensus, tx_out_file):
    """Build the shell pipeline that marks duplicates and attaches UMIs.

    The first stage is always bamsormadup reading SAM. The second stage is
    fgbio AnnotateBamWithUmis when ``umi_consensus`` is an existing file
    (UMIs in a separate file), otherwise ``umis bamtag`` piped through
    ``samtools view -b`` (UMIs embedded in the read names).

    Returns the command line as a string; nothing is executed here.
    """
    sort_tmp = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    fgbio_jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(sort_tmp), 1)
    num_cores, _ = _get_cores_memory(data)
    bamsormadup = config_utils.get_program("bamsormadup", data)
    markdup_stage = ("{bamsormadup} tmpfile={tmp_file}-markdup inputformat=sam threads={cores} outputformat=bam "
                     "level=0 SO=coordinate | ")
    has_umi_file = os.path.exists(umi_consensus) and os.path.isfile(umi_consensus)
    if has_umi_file:
        # UMIs in a separate file
        umi_stage = "fgbio {jvm_opts} AnnotateBamWithUmis -i /dev/stdin -f {umi_consensus} -o {tx_out_file}"
    else:
        # UMIs embedded in read name
        umis_python = utils.get_program_python("umis")
        umis_prog = config_utils.get_program("umis", data["config"])
        umi_stage = "%s %s bamtag - | samtools view -b > {tx_out_file}" % (umis_python, umis_prog)
    return (markdup_stage + umi_stage).format(
        bamsormadup=bamsormadup,
        tmp_file=sort_tmp,
        cores=num_cores,
        jvm_opts=fgbio_jvm_opts,
        umi_consensus=umi_consensus,
        tx_out_file=tx_out_file,
    )
Beispiel #13
0
def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in
    ``work_dir`` and ``_out_of_date`` does not flag it.

    Returns the path to ``runWorkflow.py`` inside ``work_dir``.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    if utils.file_exists(workflow_script) and not _out_of_date(workflow_script):
        return workflow_script

    config_script = os.path.realpath(utils.which("configManta.py"))
    manta_cmd = [utils.get_program_python("configManta.py"), config_script]
    # Input BAMs: tumor (plus normal when present) for paired runs,
    # otherwise one --bam argument per sample.
    if not paired:
        manta_cmd.extend("--bam=%s" % dd.get_align_bam(item) for item in items)
    else:
        if paired.normal_bam:
            manta_cmd.append("--normalBam=%s" % paired.normal_bam)
        manta_cmd.append("--tumorBam=%s" % paired.tumor_bam)

    # The remaining settings are read from a single representative sample.
    data = items[0] if not paired else paired.tumor_data
    manta_cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    manta_cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        manta_cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        manta_cmd.extend(["--region", region])
    extra_opts = config_utils.get_resources("manta", data["config"]).get("options")
    if extra_opts:
        manta_cmd.extend(str(opt) for opt in extra_opts)
    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        manta_cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])
    do.run(manta_cmd, "Configure manta SV analysis")
    return workflow_script
Beispiel #14
0
def _get_cmd():
    """Return the MetaSV command prefix as ``[python interpreter, script path]``."""
    script_name = "run_metasv.py"
    python_exe = utils.get_program_python(script_name)
    script_path = utils.which(script_name)
    return [python_exe, script_path]
Beispiel #15
0
def _umis_cmd(data):
    """Build the umis command prefix: locale export, python interpreter, umis path.

    The three parts are joined by single spaces and returned as one string.
    """
    locale_prefix = utils.locale_export()
    python_exe = utils.get_program_python("umis")
    umis_prog = config_utils.get_program("umis", data["config"], default="umis")
    return "%s %s %s" % (locale_prefix, python_exe, umis_prog)
Beispiel #16
0
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """Run alignment using Tophat v2 and return the final per-sample BAM.

    Options come from ``resources.tophat.options`` in ``data["config"]`` and
    are then adjusted by the ``_set_*`` helpers. Tophat writes into
    ``<align_dir>/<out_base>_tophat``; its ``accepted_hits.bam`` is then
    post-processed (mate fixing for paired input, merging of unmapped reads,
    read groups, sorting, picard reordering, insert size fix) and linked to
    ``<names["sample"]>.bam`` in the same directory.

    ``pair_file`` may be empty/None for single-end input. ``names`` must
    provide a ``"sample"`` key: despite the ``None`` default, the function
    indexes ``names["sample"]`` unconditionally.

    Returns the path of the final BAM; an existing final BAM is reused.

    Raises NotImplementedError when the installed Tophat has major version 1.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, data)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    # Only the (possibly adjusted) reference is needed here.
    ref_file, _runner = _determine_aligner_and_reference(ref_file, data)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file:
                # Estimate the inner distance only when it was not configured.
                if not options.get("mate-inner-dist", None):
                    d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                            ref_file, out_base,
                                                            tx_out_dir, data)
                    options["mate-inner-dist"] = d
                    options["mate-std-dev"] = d_stdev
                # Always pass the mate file for paired input. Previously it was
                # only added when the distance was estimated, so a configured
                # mate-inner-dist silently produced a single-end alignment.
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [utils.get_program_python("tophat"), config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    # A False value has no command line form; fail loudly.
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out