Example 1
 def get_version(config):
     try:
         pdir = config_utils.get_program(program_name, config, "dir")
     # not configured
     except ValueError:
         return ""
     jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
     for to_remove in [jar_name, ".jar", "-standalone"]:
         jar = jar.replace(to_remove, "")
     if jar.startswith(("-", ".")):
         jar = jar[1:]
     if not jar:
         logger.warn("Unable to determine version for program '{}' from jar file {}".format(
             program_name, config_utils.get_jar(jar_name, pdir)))
     return jar
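
The free program_name and jar_name variables indicate this get_version is a closure; later examples call programs.jar_versioner("varscan", "VarScan")(config), which suggests a factory along these lines. A minimal sketch under that assumption (it reuses the config_utils helpers shown throughout and is not the actual bcbio implementation):

import os

def jar_versioner(program_name, jar_name):
    """Build a get_version(config) closure bound to one jar-based program.

    Sketch only: inferred from the free variables above and from calls like
    programs.jar_versioner("varscan", "VarScan")(config) in later examples.
    """
    def get_version(config):
        try:
            pdir = config_utils.get_program(program_name, config, "dir")
        except ValueError:  # program not configured
            return ""
        jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
        for to_remove in [jar_name, ".jar", "-standalone"]:
            jar = jar.replace(to_remove, "")
        return jar.lstrip("-.")
    return get_version
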
Example 2
def illumina_qual_bin(in_file, ref_file, out_dir, config):
    """Uses CRAM to perform Illumina 8-bin approaches to existing BAM files.

    Bins quality scores according to Illumina scheme:

    http://www.illumina.com/Documents/products/whitepapers/whitepaper_datacompression.pdf

    Also fixes output header to remove extra run groups added by CRAM during conversion.
    """
    index_file = ref_file + ".fai"
    assert os.path.exists(index_file), "Could not find FASTA reference index: %s" % index_file
    out_file = os.path.join(out_dir, "%s-qualbin%s" % os.path.splitext(os.path.basename(in_file)))
    cram_jar = config_utils.get_jar("cramtools",
                                    config_utils.get_program("cram", config, "dir"))
    samtools = config_utils.get_program("samtools", config)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            orig_header = "%s-header.sam" % os.path.splitext(out_file)[0]
            header_cmd = "{samtools} view -H -o {orig_header} {in_file}"
            cmd = ("java -jar {cram_jar} cram --input-bam-file {in_file} "
                   " --reference-fasta-file {ref_file} --preserve-read-names "
                   " --capture-all-tags --lossy-quality-score-spec '*8' "
                   "| java -jar {cram_jar} bam --output-bam-format "
                   "  --reference-fasta-file {ref_file} "
                   "| {samtools} reheader {orig_header} - "
                   "> {tx_out_file}")
            logger.info("Quality binning with CRAM")
            subprocess.check_call(header_cmd.format(**locals()), shell=True)
            subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
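
For context, the whitepaper linked in the docstring collapses the full Phred range into roughly eight quality levels. A standalone sketch of that mapping, with bin boundaries as commonly quoted from the whitepaper and shown for illustration only (the function above delegates the real binning to cramtools via --lossy-quality-score-spec '*8'):

def bin_quality(q):
    """Map a Phred quality score onto the ~8-level Illumina binning scheme.

    Boundaries are the commonly cited whitepaper values, for illustration;
    the pipeline above lets CRAM perform the actual binning.
    """
    if q < 2:
        return q      # no-call / unusable scores left alone
    elif q < 10:
        return 6
    elif q < 20:
        return 15
    elif q < 25:
        return 22
    elif q < 30:
        return 27
    elif q < 35:
        return 33
    elif q < 40:
        return 37
    return 40

assert bin_quality(23) == 22 and bin_quality(41) == 40
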
Example 3
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
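
One caveat in this version: `version < "v2.3.5"` compares strings lexicographically, which misorders multi-digit components. Later examples switch to distutils' LooseVersion, which compares component by component:

from distutils.version import LooseVersion

# String comparison sorts character by character, so "10" sorts before "3" ...
assert "v2.10.0" < "v2.3.5"                        # True, but semantically wrong
# ... while LooseVersion splits the version into numeric components first.
assert LooseVersion("v2.10.0") > LooseVersion("v2.3.5")
assert LooseVersion("v2.3.4") < LooseVersion("v2.3.5")
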
Example 4
def run(data):
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse_jar = config_utils.get_jar("Oncofuse",
                                        config_utils.get_program("oncofuse", config, "dir"))

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example 5
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage", config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                            "magnitude": config["algorithm"].get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare", config_file,
                                                     tx_out_file, "-c", str(config["algorithm"].get("num_cores", 1))]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
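
The memory_adjust entry tells config_utils.adjust_opts to rescale the JVM heap options by the number of cores. A hypothetical sketch of that kind of rescaling, for illustration only (the real bcbio helper handles more cases):

import re

def scale_jvm_opts(jvm_opts, magnitude, direction="increase"):
    """Multiply (or divide) the numeric part of -Xms/-Xmx style options.

    Hypothetical helper: config_utils.adjust_opts applies the
    "memory_adjust" settings in a similar spirit but is not shown here.
    """
    out = []
    for opt in jvm_opts:
        match = re.match(r"^(-Xm[sx])(\d+)([gGmM])$", opt)
        if not match:
            out.append(opt)
            continue
        flag, amount, unit = match.groups()
        amount = int(amount)
        amount = amount * magnitude if direction == "increase" else max(1, amount // magnitude)
        out.append("%s%d%s" % (flag, amount, unit))
    return out

assert scale_jvm_opts(["-Xms750m", "-Xmx2g"], 4) == ["-Xms3000m", "-Xmx8g"]
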
Example 6
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Example 7
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position.
    """
    out_file = "%s-prep%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        bv_jar = config_utils.get_jar(
            "bcbio.variation", config_utils.get_program("bcbio_variation", data["config"], "dir")
        )
        resources = config_utils.get_resources("bcbio_variation", data["config"])
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        cmd = (
            ["java"]
            + jvm_opts
            + [
                "-jar",
                bv_jar,
                "variant-utils",
                "sort-vcf",
                vcf_file,
                tz.get_in(["reference", "fasta", "base"], data),
                "--sortpos",
            ]
        )
        do.run(cmd, "Sort VCF by reference")
    return out_file
Example 8
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar("bcbio.variation", config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["java"] + jvm_opts + broad.get_default_jvm_opts(tmp_dir) + ["-jar", bv_jar, "variant-compare", config_file]
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
Example 9
 def get_version(config):
     try:
         pdir = config_utils.get_program(pname, config, "dir")
     except ValueError:
         return ""
     jar = config_utils.get_jar(jar_name, pdir)
     kwargs["cmd"] = "java"
     kwargs["args"] = "-Xms128m -Xmx256m -jar %s" % jar
     return _get_cl_version(kwargs, config)
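
Here get_version delegates to a _get_cl_version helper that is not shown in these examples; presumably it runs the assembled command line and parses whatever version banner the jar prints. A stand-in sketch under that assumption (name and parsing are guesses, not the bcbio API):

import subprocess

def _get_cl_version_sketch(kwargs, config):
    """Run '{cmd} {args}' and return the first output line as the version.

    Stand-in for the _get_cl_version helper referenced above; the real
    bcbio function may use different flags and parsing.
    """
    cmd = "{cmd} {args}".format(**kwargs)
    try:
        out = subprocess.check_output(cmd, shell=True,
                                      stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as exc:
        out = exc.output  # many jars print usage/version and exit non-zero
    first_line = out.decode("utf-8", "replace").splitlines()[0] if out else ""
    return first_line.strip()
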
Example 10
def trim_adapters(fastq_files, dirs, config):
    QUALITY_CUTOFF = 5
    to_trim = _get_sequences_to_trim(config, ALIENTRIMMER_ADAPTERS)
    resources = config_utils.get_resources("AlienTrimmer", config)
    try:
        jarpath = config_utils.get_program("AlienTrimmer", config, "dir")
    # fall back on Cutadapt if AlienTrimmer is not installed
    # XXX: remove after it has been live for a while
    except:
        return trim_read_through(fastq_files, dirs, config)
    jarfile = config_utils.get_jar("AlienTrimmer", jarpath)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    base_cmd = ("java -jar {jvm_opts} {jarfile} -k 10 -l 20 ")
    fastq1 = fastq_files[0]
    supplied_quality_format = _get_quality_format(config)
    cores = config["algorithm"].get("num_cores", 0)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    fastq1_out = out_files[0]
    if supplied_quality_format == "illumina":
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][0]
    else:
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][1]
    quality_flag = '-q ' + quality_flag
    if len(fastq_files) == 1:
        if file_exists(fastq1_out):
            return [fastq1_out]
        base_cmd += ("-i {fastq1} -o {tx_fastq1_out} -c {temp_file} "
                     "{quality_flag}")
        message = "Trimming %s from %s with AlienTrimmer." % (to_trim, fastq1)
    else:
        fastq2 = fastq_files[1]
        fastq2_out = out_files[1]
        if all(map(file_exists, [fastq1_out, fastq2_out])):
            return [fastq1_out, fastq2_out]
        base_cmd += ("-if {fastq1} -ir {fastq2} -of {tx_fastq1_out} "
                     "-or {tx_fastq2_out} -c {temp_file} {quality_flag}")
        message = ("Trimming %s from %s and %s with AlienTrimmer."
                   % (to_trim, fastq1, fastq2))
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp_file = temp.name
        for adapter in to_trim:
            temp.write(adapter + "\n")
        temp.close()


    if len(fastq_files) == 1:
        with file_transaction(fastq1_out) as tx_fastq1_out:
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out]
    else:
        with file_transaction([fastq1_out, fastq2_out]) as tx_out_files:
            tx_fastq1_out = tx_out_files[0]
            tx_fastq2_out = tx_out_files[1]
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out, fastq2_out]
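
QUALITY_FLAGS is defined elsewhere in the module; from the lookups above, index 0 appears to hold the flag for illumina (phred+64) data and index 1 the one for standard (phred+33) data. A hypothetical shape for that table, keyed by Phred cutoff (values illustrative, not the actual bcbio constants):

# Hypothetical layout inferred from the QUALITY_FLAGS[QUALITY_CUTOFF][...]
# lookups above; not the actual table.
QUALITY_FLAGS = {
    5: [chr(5 + 64),   # illumina (phred+64) encoding of cutoff 5 -> 'E'
        chr(5 + 33)],  # standard/sanger (phred+33) encoding      -> '&'
}
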
Example 11
def rnaseqc_runner_from_config(config):
    """
    get a runner for Broad's RNA-SeQC tool using a bcbio-nextgen config dict to
    configure it
    """
    resources = config_utils.get_resources("rnaseqc", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    bwa_path = config_utils.get_program("bwa", config)
    rnaseqc_dir = config_utils.get_program("rnaseqc", config, "dir")
    rnaseqc_path = config_utils.get_jar("RNA-SeQC", rnaseqc_dir)
    return RNASeQCRunner(rnaseqc_path, bwa_path, jvm_opts)
Example 12
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar(
        "bcbio.variation",
        config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["java"] + jvm_opts + broad.get_default_jvm_opts(tmp_dir) + \
          ["-jar", bv_jar, "variant-compare", config_file]
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
Example 13
 def get_version(config):
     try:
         pdir = config_utils.get_program(program_name, config, "dir")
     # not configured
     except ValueError:
         return ""
     jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
     for to_remove in [jar_name, ".jar", "-standalone"]:
         jar = jar.replace(to_remove, "")
     if jar.startswith(("-", ".")):
         jar = jar[1:]
     return jar
Example 14
 def get_version(config):
     try:
         pdir = config_utils.get_program(program_name, config, "dir")
     # not configured
     except ValueError:
         return ""
     jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
     for to_remove in [jar_name, ".jar", "-standalone"]:
         jar = jar.replace(to_remove, "")
     if jar.startswith(("-", ".")):
         jar = jar[1:]
     return jar
Example 15
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar("bcbio.variation",
                                  config_utils.get_program("bcbio_variation",
                                                           data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-compare", config_file]
    log_cmd("Comparing variant calls using bcbio.variation", data, " ".join(cmd))
    subprocess.check_call(cmd)
Example 16
def get_cmd(cmd_name, datadir, config):
    """Retrieve snpEff base command line, handling command line and jar based installs.
    """
    resources = config_utils.get_resources("snpeff", config)
    memory = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]))
    try:
        snpeff = config_utils.get_program("snpEff", config)
        cmd = "{snpeff} {memory} {cmd_name} -dataDir {datadir}"
    except config_utils.CmdNotFound:
        snpeff_jar = config_utils.get_jar("snpEff", config_utils.get_program("snpeff", config, "dir"))
        config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
        cmd = "java {memory} -jar {snpeff_jar} {cmd_name} -c {config_file} -dataDir {datadir}"
    return cmd.format(**locals())
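
The returned string is a ready-made command prefix, so callers only append the tool-specific arguments before running it. A hypothetical caller following the pattern of the surrounding examples (file names and genome label are placeholders):

# Hypothetical usage of get_cmd; arguments are illustrative.
def annotate_effects(in_vcf, out_vcf, genome, datadir, config):
    snpeff_cmd = get_cmd("eff", datadir, config)   # prefix built above
    cmd = "{snpeff_cmd} {genome} {in_vcf} > {out_vcf}".format(**locals())
    do.run(cmd, "snpEff effects annotation")
    return out_vcf
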
Example 17
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file, base_dir, config):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar("bcbio.variation",
                                  config_utils.get_program("bcbio_variation", config, "dir"))
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-ensemble", config_file,
                                             ref_file, out_file] + vrn_files
    with utils.chdir(base_dir):
        do.run(cmd, "Ensemble calling: %s" % os.path.basename(base_dir))
Example 18
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --output-vcf --variants "
           "> {out_file}")
    subprocess.check_call(cmd.format(**locals()), shell=True)
Example 19
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file, base_dir, config):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar("bcbio.variation",
                                  config_utils.get_program("bcbio_variation", config, "dir"))
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-ensemble", config_file,
                                             ref_file, out_file] + vrn_files
    with utils.chdir(base_dir):
        do.run(cmd, "Ensemble calling: %s" % os.path.basename(base_dir))
Example 20
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position.
    """
    out_file = "%s-prep%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        bv_jar = config_utils.get_jar("bcbio.variation",
                                      config_utils.get_program("bcbio_variation", data["config"], "dir"))
        resources = config_utils.get_resources("bcbio_variation", data["config"])
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        cmd = ["java"] + jvm_opts + ["-jar", bv_jar, "variant-utils", "sort-vcf",
                                     vcf_file, dd.get_ref_file(data), "--sortpos"]
        do.run(cmd, "Sort VCF by reference")
    return out_file
Example 21
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   "  --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Example 22
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = 1000
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, target_regions,
                                        want_bcf=False)
        varscan = sh.Command("java").bake("-jar", varscan_jar, "mpileup2cns",
                                          "--min-coverage", "5",
                                          "--p-value", "0.98",
                                          "--output-vcf", "--variants", _out=out_handle)
        varscan(mpileup())
Example 23
def get_cmd(cmd_name, datadir, config):
    """Retrieve snpEff base command line, handling command line and jar based installs.
    """
    resources = config_utils.get_resources("snpeff", config)
    memory = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]))
    try:
        snpeff = config_utils.get_program("snpEff", config)
        cmd = "{snpeff} {memory} {cmd_name} -dataDir {datadir}"
    except config_utils.CmdNotFound:
        snpeff_jar = config_utils.get_jar(
            "snpEff", config_utils.get_program("snpeff", config, "dir"))
        config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
        cmd = "java {memory} -jar {snpeff_jar} {cmd_name} -c {config_file} -dataDir {datadir}"
    return cmd.format(**locals())
Example 24
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        cmd = (
            "cat {mpfile} "
            "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
            "  --vcf-sample-list {sample_list} --output-vcf --variants "
            "> {out_file}")
        do.run(cmd.format(**locals()), "Varscan", None,
               [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line)
Example 25
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar(
        "bcbio.variation",
        config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + [
        "-jar", bv_jar, "variant-compare", config_file
    ]
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
    subprocess.check_call(cmd)
Example 26
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --output-vcf --variants "
           "> {out_file}")
    subprocess.check_call(cmd.format(**locals()), shell=True)
Example 27
 def _get_jar(self, command, alts=None):
     """Retrieve the jar for running the specified command.
     """
     dirs = []
     for bdir in [self._gatk_dir, self._picard_ref]:
         dirs.extend([bdir, os.path.join(bdir, os.pardir, "gatk")])
     if alts is None: alts = []
     for check_cmd in [command] + alts:
         for dir_check in dirs:
             try:
                 check_file = config_utils.get_jar(command, dir_check)
                 return check_file
             except ValueError:
                 pass
     raise ValueError("Could not find jar %s in %s:%s" %
                      (command, self._picard_ref, self._gatk_dir))
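
These _get_jar variants rely on config_utils.get_jar raising ValueError both when no jar is found and when several candidates match (see the "multiple" check two examples below). A simplified sketch of such a lookup, inferred from how the callers handle its errors rather than from the actual bcbio code:

import glob
import os

def get_jar_sketch(name, dirname):
    """Locate '<name>*.jar' in dirname, raising ValueError on zero or many hits.

    Simplified stand-in for config_utils.get_jar, based on the behaviour
    the surrounding callers expect.
    """
    candidates = sorted(glob.glob(os.path.join(dirname, "%s*.jar" % name)))
    if not candidates:
        raise ValueError("Could not find jar %s in %s" % (name, dirname))
    if len(candidates) > 1:
        raise ValueError("Found multiple jars for %s in %s: %s"
                         % (name, dirname, candidates))
    return candidates[0]
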
Example 28
 def _get_jar(self, command, alts=None):
     """Retrieve the jar for running the specified command.
     """
     dirs = []
     for bdir in [self._gatk_dir, self._picard_ref]:
         dirs.extend([bdir,
                      os.path.join(bdir, os.pardir, "gatk")])
     if alts is None: alts = []
     for check_cmd in [command] + alts:
         for dir_check in dirs:
             try:
                 check_file = config_utils.get_jar(command, dir_check)
                 return check_file
             except ValueError:
                 pass
     raise ValueError("Could not find jar %s in %s:%s" % (command, self._picard_ref, self._gatk_dir))
Example 29
 def _get_jar(self, command, alts=None):
     """Retrieve the jar for running the specified command.
     """
     dirs = []
     for bdir in [self._gatk_dir, self._picard_ref]:
         dirs.extend([bdir, os.path.join(bdir, os.pardir, "gatk")])
     if alts is None: alts = []
     for check_cmd in [command] + alts:
         for dir_check in dirs:
             try:
                 check_file = config_utils.get_jar(command, dir_check)
                 return check_file
             except ValueError, msg:
                 if str(msg).find("multiple") > 0:
                     raise
                 else:
                     pass
Example 30
 def _get_jar(self, command, alts=None):
     """Retrieve the jar for running the specified command.
     """
     dirs = []
     for bdir in [self._gatk_dir, self._picard_ref]:
         dirs.extend([bdir,
                      os.path.join(bdir, os.pardir, "gatk")])
     if alts is None: alts = []
     for check_cmd in [command] + alts:
         for dir_check in dirs:
             try:
                 check_file = config_utils.get_jar(command, dir_check)
                 return check_file
             except ValueError, msg:
                 if str(msg).find("multiple") > 0:
                     raise
                 else:
                     pass
Example 31
def _freebayes_custom(in_file, ref_file, config):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.
    """
    bv_ver = programs.get_version("bcbio.variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    out_file = "%s-filter%s" % os.path.splitext(in_file)
    if not utils.file_exists(out_file):
        tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
        bv_jar = config_utils.get_jar("bcbio.variation",
                                      config_utils.get_program("bcbio_variation", config, "dir"))
        resources = config_utils.get_resources("bcbio_variation", config)
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-filter", "freebayes",
                                                 in_file, ref_file]
        do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example 32
def _run_snpeff(snp_in, genome, se_interval, out_format, config):
    snpeff_jar = config_utils.get_jar("snpEff",
                                      config_utils.get_program("snpEff", config, "dir"))
    config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
    resources = config_utils.get_resources("snpEff", config)
    ext = "vcf" if out_format == "vcf" else "tsv"
    out_file = "%s-effects.%s" % (os.path.splitext(snp_in)[0], ext)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", [])
        cl += ["-jar", snpeff_jar, "eff", "-c", config_file,
               "-1", "-i", "vcf", "-o", out_format, genome, snp_in]
        if se_interval:
            cl.extend(["-filterInterval", se_interval])
        print " ".join(cl)
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file
Example 33
def _run_snpeff(snp_in, genome, se_interval, out_format, config):
    snpeff_jar = config_utils.get_jar("snpEff",
                                      config_utils.get_program("snpEff", config, "dir"))
    config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
    resources = config_utils.get_resources("snpEff", config)
    ext = "vcf" if out_format == "vcf" else "tsv"
    out_file = "%s-effects.%s" % (os.path.splitext(snp_in)[0], ext)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", [])
        cl += ["-jar", snpeff_jar, "-c", config_file,
               "-1", "-i", "vcf", "-o", out_format, genome, snp_in]
        if se_interval:
            cl.extend(["-filterInterval", se_interval])
        print " ".join(cl)
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file
Example 34
def run(data):
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    input_type, input_dir, input_file = _get_input_para(data)
    out_file = os.path.join(input_dir, 'oncofuse_out.txt')
    oncofuse_jar = config_utils.get_jar("Oncofuse",
                                      config_utils.get_program("oncofuse",
                                                               config, "dir"))

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file]
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                cmd = " ".join(cl)
                do.run(cmd, "oncofuse fusion detection", data)
    return out_file
Example 35
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = 1000
    varscan_jar = config_utils.get_jar("VarScan", config_utils.get_program("varscan", config, "dir"))
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, target_regions, want_bcf=False)
        varscan = sh.Command("java").bake(
            "-jar",
            varscan_jar,
            "mpileup2cns",
            "--min-coverage",
            "5",
            "--p-value",
            "0.98",
            "--output-vcf",
            "--variants",
            _out=out_handle,
        )
        varscan(mpileup())
Example 36
def _run_bcbio_variation(config_file, base_dir, sample, data):
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(sample))
    out_bed_file = os.path.join(base_dir, "{0}-callregions.bed".format(sample))
    if not utils.file_exists(out_vcf_file):
        bv_jar = config_utils.get_jar("bcbio.variation",
                                      config_utils.get_program("bcbio_variation",
                                                               data["config"], "dir"))
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        subprocess.check_call(["java"] + java_args + ["-jar", bv_jar, "variant-compare", config_file])
        base_vcf = glob.glob(os.path.join(base_dir, sample, "work", "prep",
                                          "*-cfilter.vcf"))[0]
        base_bed = glob.glob(os.path.join(base_dir, sample, "work", "prep",
                                          "*-multicombine.bed"))[0]
        os.symlink(base_vcf, out_vcf_file)
        os.symlink(base_bed, out_bed_file)

    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": out_bed_file}
Example 37
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage", config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare", config_file,
                                             out_file, "-c", str(config["algorithm"]["num_cores"])]
    do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 38
def _freebayes_custom(in_file, ref_file, config):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.
    """
    bv_ver = programs.get_version("bcbio.variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    out_file = "%s-filter%s" % os.path.splitext(in_file)
    if not utils.file_exists(out_file):
        tmp_dir = utils.safe_makedir(
            os.path.join(os.path.dirname(in_file), "tmp"))
        bv_jar = config_utils.get_jar(
            "bcbio.variation",
            config_utils.get_program("bcbio_variation", config, "dir"))
        resources = config_utils.get_resources("bcbio_variation", config)
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        cmd = ["java"] + jvm_opts + java_args + [
            "-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file
        ]
        do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example 39
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar(
            "bcbio.coverage",
            config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning(
            "No coverage calculations: Did not find bcbio.coverage jar from system config"
        )
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file),
                                              "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {
        "direction": "increase",
        "magnitude": config["algorithm"].get("num_cores", 1)
    }
    jvm_opts = config_utils.adjust_opts(
        resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = [
                "-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"
            ]
            cmd = ["java"] + jvm_opts + java_args + [
                "-jar", bc_jar, "multicompare", config_file, tx_out_file, "-c",
                str(config["algorithm"].get("num_cores", 1))
            ]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 40
def run(data):
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == 'GRCh37':  #assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ['star']:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ['tophat', 'tophat2']:
            input_file = _fix_tophat_junction_output(input_file)

    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None

    out_file = os.path.join(input_dir, 'oncofuse_out.txt')

    if file_exists(out_file):
        return out_file

    oncofuse_jar = config_utils.get_jar(
        "Oncofuse", config_utils.get_program("oncofuse", config, "dir"))

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        cl += [
            "-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file
        ]
        with open(out_file, "w") as out_handle:
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                return out_file
    return out_file
Example 41
def run(data):
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == 'GRCh37': #assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ['star']:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ['tophat', 'tophat2']:
            input_file = _fix_tophat_junction_output(input_file)
    
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    
    out_file = os.path.join(input_dir, 'oncofuse_out.txt')
    
    if file_exists(out_file):
        return out_file
    
    oncofuse_jar = config_utils.get_jar("Oncofuse",
                                      config_utils.get_program("oncofuse",
                                                               config, "dir"))

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file]
        with open(out_file, "w") as out_handle:
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                return out_file
    return out_file
Example 42
def _run_bcbio_variation(config_file, base_dir, sample, data):
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(sample))
    out_bed_file = os.path.join(base_dir, "{0}-callregions.bed".format(sample))
    if not utils.file_exists(out_vcf_file):
        bv_jar = config_utils.get_jar(
            "bcbio.variation",
            config_utils.get_program("bcbio_variation", data["config"], "dir"))
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        subprocess.check_call(["java"] + java_args +
                              ["-jar", bv_jar, "variant-compare", config_file])
        base_vcf = glob.glob(
            os.path.join(base_dir, sample, "work", "prep", "*-cfilter.vcf"))[0]
        base_bed = glob.glob(
            os.path.join(base_dir, sample, "work", "prep",
                         "*-multicombine.bed"))[0]
        os.symlink(base_vcf, out_vcf_file)
        os.symlink(base_bed, out_bed_file)

    return {
        "variantcaller": "ensemble",
        "vrn_file": out_vcf_file,
        "bed_file": out_bed_file
    }
Example 43
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        vcfutils.write_empty_vcf(out_file)
Example 44
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                config,
                                                max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(config, indel_file,
                              snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(**locals()), "Varscan paired fix")

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name,
                             config)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name,
                             config)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
Example 45
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):

    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"

        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        to_combine = []

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None,
                   None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, normal_name, tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, normal_name, tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
Example 46
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # Retrieve the tumor/normal pair; both BAMs are required for somatic calling
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                max_read_depth,
                                                config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"

        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        to_combine = []

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        out_file = combine_variant_files(to_combine,
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
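
Each paired example post-processes the raw VarScan output with _fix_varscan_vcf before merging, because VarScan's somatic VCFs label the sample columns with the generic names NORMAL and TUMOR and can emit records that downstream merging rejects. The helper is not shown in this listing; the following is only a sketch of what such a clean-up might do, and the specific checks are assumptions rather than the source's actual logic:

def _fix_varscan_vcf(orig_file, normal_name, tumor_name):
    """Hypothetical sketch: normalize a raw VarScan somatic VCF in place.

    Assumes two fixes are needed: replacing the generic NORMAL/TUMOR
    sample columns with the real sample names, and dropping records
    whose REF/ALT fields contain characters that are not valid VCF
    alleles. The real helper may apply different or extra corrections.
    """
    fixed_file = orig_file + ".fix"
    with open(orig_file) as in_handle, open(fixed_file, "w") as out_handle:
        for line in in_handle:
            if line.startswith("#CHROM"):
                line = line.replace("NORMAL", normal_name).replace("TUMOR", tumor_name)
            elif not line.startswith("#"):
                ref, alt = line.split("\t")[3:5]
                if not all(c in "ACGTN,." for c in (ref + alt).upper()):
                    continue
            out_handle.write(line)
    os.rename(fixed_file, orig_file)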
Example No. 47
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # Retrieve the tumor/normal pair; both BAMs are required for somatic calling
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"), (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            with utils.curdir_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                      "min_allele_fraction"),10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        out_file = combine_variant_files(to_combine,
                                         out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
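
When the caller asked for a compressed .vcf.gz output, the last example finishes by running bgzip_and_index on the merged VCF. That helper is also outside this listing; assuming it simply shells out to bgzip and tabix on the PATH, a bare-bones stand-in might be:

def bgzip_and_index(in_file, config):
    """Hypothetical sketch: bgzip-compress a VCF and build a tabix index.

    Assumes bgzip and tabix are on the PATH; config is accepted only for
    interface compatibility. The helper used above may instead go through
    the pipeline's own command wrappers and clean up the uncompressed file.
    """
    out_file = in_file + ".gz"
    if not file_exists(out_file):
        subprocess.check_call("bgzip -c {0} > {1}".format(in_file, out_file), shell=True)
    if not file_exists(out_file + ".tbi"):
        subprocess.check_call(["tabix", "-p", "vcf", out_file])
    return out_file

For example, bgzip_and_index("sample-paired.vcf", config) would yield sample-paired.vcf.gz plus a matching .tbi index next to it.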