Ejemplo n.º 1
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the samtools mpileup command line into VarScan ``mpileup2cns`` and
    redirects the resulting VCF to ``out_file``. A zero-byte result is
    replaced through ``write_empty_vcf``.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        config: configuration used to look up the VarScan jar, its version
            and its ``jvm_opts`` resources.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path the VarScan output is redirected to.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.5.
    """
    import re  # function-scope import; only the version check needs it

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare the numeric components rather than the raw strings: a plain
    # string comparison sorts "v2.3.10" before "v2.3.5" and treats an
    # un-prefixed "2.4.0" as older than "v2.3.5".
    version_parts = tuple(int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): both defaults set -Xmx; the first was presumably meant to
    # be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    # Name every placeholder explicitly instead of formatting with locals().
    cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    # The sample list is only needed while VarScan runs.
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 2
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Streams the samtools mpileup command through a grep filter into VarScan
    ``mpileup2cns`` and redirects the VCF to ``out_file``. A zero-byte
    result is replaced through ``write_empty_vcf``.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        items: sequence whose first element carries the ``"config"`` entry
            used for all lookups.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path the VarScan output is redirected to.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.5.
    """
    import re  # function-scope import; only the version check needs it

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numerically: as plain strings "v2.3.10" sorts before "v2.3.5".
    version_parts = tuple(int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           "  --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    # Name every placeholder explicitly instead of formatting with locals().
    cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    # The sample list is only needed while VarScan runs.
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 3
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Runs ``<mpileup> | java -jar VarScan mpileup2cns ... > out_file`` as one
    shell pipeline, then makes sure ``out_file`` is never left as a
    zero-byte file.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        config: configuration used to look up the VarScan jar, its version
            and its ``jvm_opts`` resources.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path the VarScan output is redirected to.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.5.
    """
    import re  # function-scope import; only the version check needs it

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # String comparison is lexicographic ("v2.3.10" < "v2.3.5"), so compare
    # the numeric release components as a tuple instead.
    version_parts = tuple(
        int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 5):
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): both defaults set -Xmx; the first was presumably meant to
    # be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    cmd = (
        "{mpileup} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    # Explicit keywords keep the template independent of local variable names.
    cmd = cmd.format(mpileup=mpileup,
                     jvm_opts=jvm_opts,
                     varscan_jar=varscan_jar,
                     sample_list=sample_list,
                     out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    # The sample list is only needed while VarScan runs.
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 4
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    The grep-filtered mpileup is first written to a temporary ``.mpileup``
    file so an empty pileup can be detected; VarScan is only started when
    that file has content. The uncompressed VCF is post-processed with
    ``freebayes.clean_vcf_output`` and, when the requested ``out_file`` ends
    in ``.gz``, passed to ``vcfutils.bgzip_and_index``.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        items: sequence whose first element carries the ``"config"`` entry
            and is also passed to ``tx_tmpdir``.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: requested output path; a ``.vcf.gz`` suffix is mapped to
            ``.vcf`` for the intermediate file.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
    """
    import re  # function-scope import; only the version check needs it

    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numerically: as plain strings "v2.3.10" sorts before "v2.3.6".
    version_parts = tuple(
        int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    config,
                                    max_read_depth,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = "{mpileup} | {remove_zerocoverage} > {mpfile_tx}"
        do.run(cmd.format(mpileup=mpileup,
                          remove_zerocoverage=remove_zerocoverage,
                          mpfile_tx=mpfile_tx),
               "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        # Nothing to call on: skip VarScan entirely.
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "cat {mpfile} "
                "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                "  --vcf-sample-list {sample_list} --output-vcf --variants "
                "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(mpfile=mpfile,
                              jvm_opts=jvm_opts,
                              varscan_jar=varscan_jar,
                              sample_list=sample_list,
                              fix_ambig=fix_ambig,
                              out_file=out_file),
                   "Varscan", None, [do.file_exists(out_file)])
    # Both helper files are only needed while the commands above run.
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Ejemplo n.º 5
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Works in two steps: the grep-filtered mpileup is written to a temporary
    ``.mpileup`` file, and VarScan ``mpileup2cns`` is run on it only when
    that file is non-empty. The VCF is cleaned with
    ``freebayes.clean_vcf_output`` and bgzipped/indexed when the requested
    ``out_file`` ends in ``.gz``.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        items: sequence whose first element carries the ``"config"`` entry
            and is also passed to ``tx_tmpdir``.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: requested output path; a ``.vcf.gz`` suffix is mapped to
            ``.vcf`` for the intermediate file.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
    """
    import re  # function-scope import; only the version check needs it

    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # A raw string comparison is lexicographic ("v2.3.10" < "v2.3.6"), so
    # compare the numeric release components as a tuple instead.
    version_parts = tuple(int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = "{mpileup} | {remove_zerocoverage} > {mpfile_tx}"
        cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                         mpfile_tx=mpfile_tx)
        do.run(cmd, "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        # Nothing to call on: skip VarScan entirely.
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   "  --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            cmd = cmd.format(mpfile=mpfile, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                             sample_list=sample_list, fix_ambig=fix_ambig,
                             out_file=out_file)
            do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    # Both helper files are only needed while the commands above run.
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
Ejemplo n.º 6
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Call SNPs and indels with VarScan ``mpileup2cns``.

    Builds one shell pipeline (mpileup, zero-coverage filter, VarScan, header
    and output fix-ups, ambiguity fixes, ``vcfuniqalleles``) that writes an
    uncompressed VCF. A zero-byte result is replaced via ``write_empty_vcf``
    and the VCF is bgzipped/indexed when ``out_file`` ends in ``.gz``.
    """
    config = items[0]["config"]

    requested_out = out_file
    vcf_out = requested_out.replace(".vcf.gz", ".vcf")

    depth_limit = "1000"
    samples_fname = _create_sample_list(align_bams, vcf_out)
    mpileup_cl = samtools.prep_mpileup(
        align_bams, ref_file, config, depth_limit,
        target_regions=target_regions, want_bcf=False)
    # VarScan does not emit a header when its input starts with
    # zero-coverage positions, so those lines are dropped up front.
    # ifne (moreutils) runs the wrapped command only when stdin has data:
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    drop_zerocov = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    with tx_tmpdir(items[0]) as tmp_dir:
        java_opts = _get_jvm_opts(config, tmp_dir)
        varscan_opts = " ".join(_varscan_options_from_config(config))
        # Configured as a percentage; VarScan expects a fraction.
        af_percent = utils.get_in(
            config, ("algorithm", "min_allele_fraction"), 10)
        af_fraction = float(af_percent) / 100.0
        ambig_ref_cl = vcfutils.fix_ambiguous_cl()
        ambig_alt_cl = vcfutils.fix_ambiguous_cl(5)
        py_exe = os.path.join(os.path.dirname(sys.executable), "py")
        path_export = utils.local_path_export()
        template = (
            "{export} {mpileup} | {remove_zerocoverage} | "
            "ifne varscan {jvm_opts} mpileup2cns {opts} "
            "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
            """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
            "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
            "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}"
        )
        pipeline = template.format(
            export=path_export,
            mpileup=mpileup_cl,
            remove_zerocoverage=drop_zerocov,
            jvm_opts=java_opts,
            opts=varscan_opts,
            sample_list=samples_fname,
            min_af=af_fraction,
            py_cl=py_exe,
            ref_file=ref_file,
            fix_ambig_ref=ambig_ref_cl,
            fix_ambig_alt=ambig_alt_cl,
            out_file=vcf_out)
        do.run(pipeline, "Varscan", None, [do.file_exists(vcf_out)])
    os.remove(samples_fname)
    # Regions without variants can leave a completely empty file; swap it
    # for a correctly formatted empty VCF.
    if not os.path.getsize(vcf_out):
        write_empty_vcf(vcf_out)

    if requested_out.endswith(".gz"):
        vcfutils.bgzip_and_index(vcf_out, config)
Ejemplo n.º 7
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Genotype SNPs and indels by piping samtools mpileup into VarScan.

    The pipeline is assembled stage by stage and run as a single shell
    command writing an uncompressed VCF. An empty result is replaced via
    ``write_empty_vcf``; the VCF is bgzipped and indexed when the requested
    ``out_file`` ends in ``.gz``.
    """
    config = items[0]["config"]

    gz_or_vcf_target = out_file
    plain_vcf = gz_or_vcf_target.replace(".vcf.gz", ".vcf")

    read_depth_cap = "1000"
    sample_names_file = _create_sample_list(align_bams, plain_vcf)
    pileup_cmd = samtools.prep_mpileup(align_bams, ref_file, config, read_depth_cap,
                                       target_regions=target_regions, want_bcf=False)
    # Leading zero-coverage lines stop VarScan from writing a header, so grep
    # them away. ifne (moreutils) only runs its command when there is input:
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    zerocov_filter = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    with tx_tmpdir(items[0]) as tmp_dir:
        values = {}
        values["jvm_opts"] = _get_jvm_opts(config, tmp_dir)
        values["opts"] = " ".join(_varscan_options_from_config(config))
        # min_allele_fraction is a percentage in the config.
        values["min_af"] = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        values["fix_ambig_ref"] = vcfutils.fix_ambiguous_cl()
        values["fix_ambig_alt"] = vcfutils.fix_ambiguous_cl(5)
        values["py_cl"] = os.path.join(os.path.dirname(sys.executable), "py")
        values["export"] = utils.local_path_export()
        values["mpileup"] = pileup_cmd
        values["remove_zerocoverage"] = zerocov_filter
        values["sample_list"] = sample_names_file
        values["ref_file"] = ref_file
        values["out_file"] = plain_vcf
        stages = [
            "{export} {mpileup}",
            "{remove_zerocoverage}",
            ("ifne varscan {jvm_opts} mpileup2cns {opts} "
             "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants"),
            """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")'""",
            "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)'",
            "{fix_ambig_ref}",
            "{fix_ambig_alt}",
            "ifne vcfuniqalleles > {out_file}",
        ]
        # Join the templates first and substitute once, so brace characters
        # inside substituted values are never re-interpreted.
        shell_cmd = " | ".join(stages).format(**values)
        do.run(shell_cmd, "Varscan", None, [do.file_exists(plain_vcf)])
    os.remove(sample_names_file)
    # A region with no variants may yield a zero-byte file; replace it with
    # a properly formatted empty VCF.
    produced_nothing = os.path.getsize(plain_vcf) == 0
    if produced_nothing:
        write_empty_vcf(plain_vcf)

    if gz_or_vcf_target.endswith(".gz"):
        vcfutils.bgzip_and_index(plain_vcf, config)
Ejemplo n.º 8
0
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Streams the samtools mpileup command through a grep filter into VarScan
    ``mpileup2cns`` and redirects the VCF to ``out_file``. A zero-byte
    result is replaced through ``write_empty_vcf``.

    Args:
        align_bams: alignment inputs handed to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        items: sequence whose first element carries the ``"config"`` entry
            used for all lookups.
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path the VarScan output is redirected to.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
    """
    import re  # function-scope import; only the version check needs it

    config = items[0]["config"]

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numerically: as plain strings "v2.3.10" sorts before "v2.3.6".
    version_parts = tuple(
        int(part) for part in re.findall(r"\d+", version or ""))
    if version_parts < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams,
                                    ref_file,
                                    max_read_depth,
                                    config,
                                    target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        "  --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    # Explicit keywords keep the template independent of local variable names.
    cmd = cmd.format(mpileup=mpileup,
                     remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts,
                     varscan_jar=varscan_jar,
                     sample_list=sample_list,
                     out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    # The sample list is only needed while VarScan runs.
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 9
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample (normal, tumor), runs VarScan
    ``somatic`` on the pair, fixes the SNP and indel VCFs it produced,
    combines them into the output and removes the intermediates. Nothing is
    done when ``out_file`` already exists.

    Args:
        align_bams: alignment inputs handed to ``get_paired_bams``.
        ref_file: reference handed to ``samtools.prep_mpileup`` and
            ``combine_variant_files``.
        items: sequence whose first element carries the ``"config"`` entry;
            also handed to ``get_paired_bams`` and ``utils.curdir_tmpdir``.
        target_regions: forwarded to ``samtools.prep_mpileup`` and used as
            the ``region`` for ``combine_variant_files``.
        out_file: requested output path; a ``.vcf.gz`` suffix is mapped to
            ``.vcf`` for the intermediate file.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
        ValueError: if no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if file_exists(out_file):
        return

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base = utils.splitext_plus(out_file)[0]
    cleanup_files = []
    for fname, mpext in [(paired.normal_bam, "normal"), (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file,
                                            max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup = cleanup_files[0]
    tumor_tmp_mpileup = cleanup_files[1]

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)
    # NOTE(review): the transactional paths are not referenced by the
    # command; VarScan is given {base} as its output name -- confirm this
    # is intended.
    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        with utils.curdir_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            # min_allele_fraction is configured as a percentage.
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                           " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 "
                           "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts,
                                      varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      base=base,
                                      min_af=min_af),
                   "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        return

    # Only combine the files VarScan actually produced; a missing SNP or
    # indel file must not be handed to the merge step.
    out_file = combine_variant_files(to_combine,
                                     out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files, including any compressed/indexed siblings
    for extra_file in cleanup_files:
        for suffix in ["", ".gz", ".gz.tbi"]:
            if os.path.exists(extra_file + suffix):
                os.remove(extra_file + suffix)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)

    _add_reject_flag(out_file, config)
Ejemplo n.º 10
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample (normal, tumor), runs VarScan
    ``somatic`` on the pair, fixes the SNP and indel VCFs it produced,
    combines them into ``out_file`` and removes the intermediates. Nothing
    is done when ``out_file`` already exists.

    Args:
        align_bams: alignment inputs handed to ``get_paired_bams``.
        ref_file: reference handed to ``samtools.prep_mpileup`` and
            ``combine_variant_files``.
        items: sequence whose first element carries the ``"config"`` entry;
            also handed to ``get_paired_bams``.
        target_regions: forwarded to ``samtools.prep_mpileup`` and used as
            the ``region`` for ``combine_variant_files``.
        out_file: path of the combined VCF.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
        ValueError: if no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if file_exists(out_file):
        return

    base = os.path.splitext(out_file)[0]
    cleanup_files = []
    for fname, mpext in [(paired.normal_bam, "normal"),
                         (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname],
                                            ref_file,
                                            max_read_depth,
                                            config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(out_file)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup = cleanup_files[0]
    tumor_tmp_mpileup = cleanup_files[1]

    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98 "
                   "--strand-filter 1 ")
    varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts,
                                     varscan_jar=varscan_jar,
                                     normal_tmp_mpileup=normal_tmp_mpileup,
                                     tumor_tmp_mpileup=tumor_tmp_mpileup,
                                     base=base)

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    # NOTE(review): the transactional paths are not referenced by the
    # command; VarScan is given {base} as its output name -- confirm this
    # is intended.
    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        return

    # Only combine the files VarScan actually produced; a missing SNP or
    # indel file must not be handed to the merge step.
    out_file = combine_variant_files(to_combine,
                                     out_file,
                                     ref_file,
                                     config,
                                     region=target_regions)

    # Remove cleanup files. One of the VarScan outputs may be absent, so
    # only delete what is really there.
    for extra_file in cleanup_files:
        if os.path.exists(extra_file):
            os.remove(extra_file)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 11
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample (normal, tumor), runs VarScan
    ``somatic`` on the pair, fixes the SNP and indel VCFs it produced,
    combines them into ``out_file`` and removes the intermediates. Nothing
    is done when ``out_file`` already exists.

    Args:
        align_bams: alignment inputs handed to ``get_paired_bams``.
        ref_file: reference handed to ``samtools.prep_mpileup`` and
            ``combine_variant_files``.
        items: sequence whose first element carries the ``"config"`` entry;
            also handed to ``get_paired_bams``.
        target_regions: forwarded to ``samtools.prep_mpileup`` and used as
            the ``region`` for ``combine_variant_files``.
        out_file: path of the combined VCF.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    # get_paired_bams is unpacked as a 4-tuple here.
    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if file_exists(out_file):
        return

    base = os.path.splitext(out_file)[0]
    cleanup_files = []
    for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file,
                                            max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(out_file)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup = cleanup_files[0]
    tumor_tmp_mpileup = cleanup_files[1]

    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98")
    varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts,
                                     varscan_jar=varscan_jar,
                                     normal_tmp_mpileup=normal_tmp_mpileup,
                                     tumor_tmp_mpileup=tumor_tmp_mpileup,
                                     base=base)

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    # NOTE(review): the transactional paths are not referenced by the
    # command; VarScan is given {base} as its output name -- confirm this
    # is intended.
    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, normal_name, tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        return

    # Only combine the files VarScan actually produced; a missing SNP or
    # indel file must not be handed to the merge step.
    out_file = combine_variant_files(to_combine,
                                     out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files. One of the VarScan outputs may be absent, so
    # only delete what is really there.
    for extra_file in cleanup_files:
        if os.path.exists(extra_file):
            os.remove(extra_file)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Ejemplo n.º 12
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample (normal, tumor), runs VarScan
    ``somatic`` into transactional ``-orig`` outputs, post-processes those
    with ``vcfuniqalleles`` and the ambiguity fix, corrects the resulting
    SNP and indel VCFs, combines them and removes the intermediates.
    Nothing is done when ``out_file`` already exists.

    Args:
        align_bams: alignment inputs handed to ``get_paired_bams``.
        ref_file: reference handed to ``samtools.prep_mpileup`` and
            ``combine_variant_files``.
        items: sequence whose first element carries the ``"config"`` and
            ``"metadata"`` entries; also handed to ``get_paired_bams`` and
            ``tx_tmpdir``.
        target_regions: forwarded to ``samtools.prep_mpileup`` and used as
            the ``region`` for ``combine_variant_files``.
        out_file: requested output path; a ``.vcf.gz`` suffix is mapped to
            ``.vcf`` for the intermediate file.

    Raises:
        IOError: if the detected VarScan version is older than 2.3.6.
        ValueError: if no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if file_exists(out_file):
        return

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base = utils.splitext_plus(out_file)[0]
    cleanup_files = []
    for fname, mpext in [(paired.normal_bam, "normal"),
                         (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(config, mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname],
                                            ref_file,
                                            config,
                                            max_read_depth,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup = cleanup_files[0]
    tumor_tmp_mpileup = cleanup_files[1]

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)
    with file_transaction(config, indel_file,
                          snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # VarScan writes to the "-orig" names; the fixed-up results go
            # to the transactional paths afterwards.
            tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
            tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
            # min_allele_fraction is configured as a percentage.
            min_af = float(
                utils.get_in(paired.tumor_config,
                             ("algorithm", "min_allele_fraction"),
                             10)) / 100.0
            varscan_cmd = (
                "java {jvm_opts} -jar {varscan_jar} somatic"
                " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                " --output-vcf --min-coverage 5 --p-value 0.98 "
                "--strand-filter 1 "
                "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts,
                                      varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      tx_snp_in=tx_snp_in,
                                      tx_indel_in=tx_indel_in,
                                      min_af=min_af),
                   "Varscan", None, None)
            for orig_fname, fixed_fname in [(tx_snp_in, tx_snp),
                                            (tx_indel_in, tx_indel)]:
                cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                do.run(cmd.format(orig_fname=orig_fname,
                                  fix_ambig=fix_ambig,
                                  fname=fixed_fname),
                       "Varscan paired fix")

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name,
                             config)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        return

    # Only combine the files VarScan actually produced; a missing SNP or
    # indel file must not be handed to the merge step.
    out_file = combine_variant_files(to_combine,
                                     out_file,
                                     ref_file,
                                     config,
                                     region=target_regions)

    # Remove cleanup files, including any compressed/indexed siblings
    for extra_file in cleanup_files:
        for suffix in ["", ".gz", ".gz.tbi"]:
            if os.path.exists(extra_file + suffix):
                os.remove(extra_file + suffix)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)

    _add_reject_flag(out_file, config)