Exemple #1
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
       write_empty_vcf(out_file)
Exemple #2
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None,
           [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Exemple #3
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    cmd = (
        "{mpileup} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Exemple #4
0
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.
    """
    out_file = "%s-mpileup.txt" % out_base
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            background_bed = os.path.normpath(
                os.path.join(
                    os.path.dirname(
                        os.path.realpath(utils.which("verifybamid2"))),
                    "resource", "%s.%s.%s.vcf.gz.dat.bed" %
                    (background["dataset"], background["nvars"],
                     background["build"])))
            local_bed = os.path.join(
                os.path.dirname(out_base), "%s.%s-hg19.bed" %
                (background["dataset"], background["nvars"]))
            if not utils.file_exists(local_bed):
                with file_transaction(data, local_bed) as tx_local_bed:
                    with open(background_bed) as in_handle:
                        with open(tx_local_bed, "w") as out_handle:
                            for line in in_handle:
                                out_handle.write("chr%s" % line)
            mpileup_cl = samtools.prep_mpileup([bam_file],
                                               dd.get_ref_file(data),
                                               data["config"],
                                               want_bcf=False,
                                               target_regions=local_bed)
            cl = ("{mpileup_cl} | sed 's/^chr//' > {tx_out_file}")
            do.run(cl.format(**locals()), "Create pileup from BAM input")
    return out_file
Exemple #5
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    config,
                                    max_read_depth,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "cat {mpfile} "
                "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                "  --vcf-sample-list {sample_list} --output-vcf --variants "
                "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Exemple #6
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --output-vcf --variants "
           "> {out_file}")
    subprocess.check_call(cmd.format(**locals()), shell=True)
Exemple #7
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   "  --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Exemple #8
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    config,
                                    max_read_depth,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        min_af = float(
            utils.get_in(config,
                         ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        cmd = (
            "{export} {mpileup} | {remove_zerocoverage} | "
            "ifne varscan {jvm_opts} mpileup2cns {opts} "
            "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
            """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
            "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
            "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}"
        )
        do.run(cmd.format(**locals()), "Varscan", None,
               [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Exemple #9
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = 1000
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, target_regions,
                                        want_bcf=False)
        varscan = sh.Command("java").bake("-jar", varscan_jar, "mpileup2cns",
                                          "--min-coverage", "5",
                                          "--p-value", "0.98",
                                          "--output-vcf", "--variants", _out=out_handle)
        varscan(mpileup())
Exemple #10
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --output-vcf --variants "
           "> {out_file}")
    subprocess.check_call(cmd.format(**locals()), shell=True)
Exemple #11
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        cmd = ("{export} {mpileup} | {remove_zerocoverage} | "
               "ifne varscan {jvm_opts} mpileup2cns {opts} "
               "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
               """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
               "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
               "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
        do.run(cmd.format(**locals()), "Varscan", None,
                [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Exemple #12
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = 1000
    varscan_jar = config_utils.get_jar("VarScan", config_utils.get_program("varscan", config, "dir"))
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, target_regions, want_bcf=False)
        varscan = sh.Command("java").bake(
            "-jar",
            varscan_jar,
            "mpileup2cns",
            "--min-coverage",
            "5",
            "--p-value",
            "0.98",
            "--output-vcf",
            "--variants",
            _out=out_handle,
        )
        varscan(mpileup())
Exemple #13
0
def _get_input_args(bam_file, data, out_base):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so need to generate
    a pileup for hg19 and fix chromosome naming.
    """
    if dd.get_genome_build(data) in ["hg19"]:
        out_file = "%s-mpileup.txt" % out_base
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                mpileup_cl = samtools.prep_mpileup(
                    [bam_file],
                    dd.get_ref_file(data),
                    data["config"],
                    want_bcf=False,
                    target_regions=_get_autosomal_bed(data, tx_out_file))
                cl = ("{mpileup_cl} | sed 's/^chr//' > {tx_out_file}")
                do.run(cl.format(**locals()), "Create pileup from BAM input")
        return ["--PileupFile", out_file]
    else:
        return ["--BamFile", bam_file]
Exemple #14
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan".format(**locals()), None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.
    """
    out_file = "%s-mpileup.txt" % out_base
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            background_bed = os.path.normpath(os.path.join(
                os.path.dirname(os.path.realpath(utils.which("verifybamid2"))),
                "resource", "%s.%s.%s.vcf.gz.dat.bed" % (background["dataset"],
                                                         background["nvars"], background["build"])))
            local_bed = os.path.join(os.path.dirname(out_base),
                                     "%s.%s-hg19.bed" % (background["dataset"], background["nvars"]))
            if not utils.file_exists(local_bed):
                with file_transaction(data, local_bed) as tx_local_bed:
                    with open(background_bed) as in_handle:
                        with open(tx_local_bed, "w") as out_handle:
                            for line in in_handle:
                                out_handle.write("chr%s" % line)
            mpileup_cl = samtools.prep_mpileup([bam_file], dd.get_ref_file(data), data["config"], want_bcf=False,
                                                target_regions=local_bed)
            cl = ("{mpileup_cl} | sed 's/^chr//' > {tx_out_file}")
            do.run(cl.format(**locals()), "Create pileup from BAM input")
    return out_file
Exemple #16
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):

    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not utils.file_exists(out_file):
        assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"
        normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file,
                                                  config, max_read_depth,
                                                  target_regions=target_regions,
                                                  want_bcf=False)
        tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file,
                                                 config, max_read_depth,
                                                 target_regions=target_regions,
                                                 want_bcf=False)
        base, ext = utils.splitext_plus(out_file)
        indel_file = base + "-indel.vcf"
        snp_file = base + "-snp.vcf"
        with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                remove_zerocoverage = r"ifne grep -v -P '\t0\t\t$'"
                varscan_cmd = ("varscan {jvm_opts} somatic "
                               " <({normal_mpileup_cl} | {remove_zerocoverage}) "
                               "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                               "--output-snp {tx_snp} --output-indel {tx_indel} "
                               " --output-vcf --min-coverage 5 --p-value 0.98 "
                               "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                      "min_allele_fraction"), 10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

        to_combine = []
        for fname in [snp_file, indel_file]:
            if utils.file_exists(fname):
                fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
                with file_transaction(config, fix_file) as tx_fix_file:
                    fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                    fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                    py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                    normal_name = paired.normal_name
                    tumor_name = paired.tumor_name
                    cmd = ("cat {fname} | "
                           "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                            """ "{normal_name}", "{tumor_name}")' | """
                           "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                           """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                           "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                           "bgzip -c > {tx_fix_file}")
                    do.run(cmd.format(**locals()), "Varscan paired fix")
                to_combine.append(fix_file)

        if not to_combine:
            out_file = write_empty_vcf(out_file, config)
        else:
            out_file = combine_variant_files(to_combine,
                                             out_file, ref_file, config,
                                             region=target_regions)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
        if out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)
Exemple #17
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):

    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"), (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            with utils.curdir_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                      "min_allele_fraction"),10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
Exemple #18
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                max_read_depth,
                                                config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"

        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        to_combine = []

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
Exemple #19
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):

    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"

        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        to_combine = []

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None,
                   None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records

        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, normal_name, tumor_name)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, normal_name, tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
Exemple #20
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                config,
                                                max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(config, indel_file,
                              snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(**locals()), "Varscan paired fix")

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name,
                             config)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name,
                             config)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)