Beispiel #1
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline that feeds the mpileup command prepared by
    ``samtools.prep_mpileup`` into VarScan ``mpileup2cns`` and redirects the
    VCF output to ``out_file``.

    align_bams -- BAM files handed to the mpileup and sample list helpers
    ref_file -- reference handed to ``samtools.prep_mpileup``
    config -- configuration used to find the VarScan jar, version and resources
    target_regions -- regions handed to ``samtools.prep_mpileup``
    out_file -- path the VCF is written to

    Raises IOError when the detected VarScan version is older than 2.3.5.
    """
    import re  # local import: only needed for the version check

    max_read_depth = "1000"
    min_version = (2, 3, 5)
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components instead of raw strings: lexicographically
    # "v2.3.10" sorts before "v2.3.5" and would be rejected incorrectly.
    version_numbers = tuple(int(part)
                            for part in re.findall(r"\d+", str(version or "")))
    if version_numbers < min_version:
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): both default options are -Xmx; the first may have been
    # intended as -Xms -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                        target_regions=target_regions, want_bcf=False)
        cmd = ("{mpileup} "
               "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
               "  --vcf-sample-list {sample_list} --output-vcf --variants "
               "> {out_file}")
        cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                         sample_list=sample_list, out_file=out_file)
        do.run(cmd, "Varscan", None,
               [do.file_exists(out_file)])
    finally:
        # Always clean up the temporary sample list, even when the run fails
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Beispiel #2
0
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the mpileup command prepared by ``samtools.prep_mpileup`` into
    VarScan ``mpileup2cns`` and redirects the resulting VCF to ``out_file``.

    align_bams -- BAM files handed to the mpileup and sample list helpers
    ref_file -- reference handed to ``samtools.prep_mpileup``
    config -- configuration used to find the VarScan jar, version and resources
    target_regions -- regions handed to ``samtools.prep_mpileup``
    out_file -- path the VCF is written to

    Raises IOError when the detected VarScan version is older than 2.3.5.
    """
    import re  # local import: only needed for the version check

    max_read_depth = "1000"
    required_version = (2, 3, 5)
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # A plain string comparison is lexicographic, so "v2.3.10" would be
    # treated as older than "v2.3.5"; compare the numeric components instead.
    found_version = tuple(
        int(piece) for piece in re.findall(r"\d+", str(version or "")))
    if found_version < required_version:
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): both default options are -Xmx; the first may have been
    # intended as -Xms -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams,
                                        ref_file,
                                        max_read_depth,
                                        config,
                                        target_regions=target_regions,
                                        want_bcf=False)
        cmd = (
            "{mpileup} "
            "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
            "  --vcf-sample-list {sample_list} --output-vcf --variants "
            "> {out_file}")
        cmd = cmd.format(mpileup=mpileup,
                         jvm_opts=jvm_opts,
                         varscan_jar=varscan_jar,
                         sample_list=sample_list,
                         out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # Remove the temporary sample list whether or not the run succeeded
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Beispiel #3
0
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       dbsnp=None, region=None, out_file=None):
    """Index the input BAMs and run a variant calling function on them.

    Every BAM is indexed through the broad runner's "picard_index" function.
    When ``out_file`` is not given it is derived from the first BAM name.
    If the output does not exist yet, either an empty VCF is written (the
    subset region file is missing, or some BAM has no aligned reads in
    ``region``) or ``call_fn`` is run inside a file transaction.

    Returns the output file path.
    """
    runner = broad.runner_from_config(config)
    for bam in align_bams:
        runner.run_fn("picard_index", bam)

    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-variants.vcf"
    if file_exists(out_file):
        return out_file

    first_bam_name = os.path.basename(align_bams[0])
    logger.info("Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=first_bam_name))
    variant_regions = config["algorithm"].get("variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Regions were requested but no subset file is present on disk
    regions_missing = (variant_regions is not None
                       and not os.path.isfile(target_regions))
    if regions_missing or any(not has_aligned_reads(bam, region)
                              for bam in align_bams):
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
Beispiel #4
0
def _run_cortex_on_region(region, align_bam, ref_file, out_file_base, config):
    """Run cortex on a specified chromosome start/end region.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config["program"].get("cortex")
    stampy_dir = config["program"].get("stampy")
    vcftools_dir = config["program"].get("vcftools")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError("cortex_var requires path to pre-built cortex and stampy")
    region_str = apply("{0}-{1}-{2}".format, region)
    base_dir = safe_makedir(os.path.join(os.path.dirname(out_file_base), region_str))
    out_vcf_base = os.path.join(base_dir, "{0}-{1}".format(
            os.path.splitext(os.path.basename(out_file_base))[0], region_str))
    out_file = "{0}.vcf".format(out_vcf_base)
    if not file_exists(out_file):
        fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
        if _count_fastq_reads(fastq, min_reads) < min_reads:
            write_empty_vcf(out_file)
        else:
            local_ref, genome_size = _get_local_ref(region, ref_file, out_vcf_base)
            indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers)
            cortex_out = _run_cortex(fastq, indexes, {"kmers": kmers, "genome_size": genome_size,
                                                      "sample": _get_sample_name(align_bam)},
                                     out_vcf_base, {"cortex": cortex_dir, "stampy": stampy_dir,
                                                    "vcftools": vcftools_dir},
                                     config)
            if cortex_out:
                _remap_cortex_out(cortex_out, region, out_file)
            else:
                write_empty_vcf(out_file)
    return out_file
Beispiel #5
0
def run_cortex(align_bams, ref_file, config, dbsnp=None, region=None,
               out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    align_bams -- list that must hold exactly one BAM file
    ref_file -- reference passed on to the per-region and combine steps
    config -- configuration; ``config["algorithm"]["variant_regions"]`` is required
    dbsnp -- not used by this function
    region -- optional region string; used for the work directory name and
        passed to ``subset_variant_regions``
    out_file -- output VCF path; derived from the BAM name when omitted

    Returns the output file path.

    Raises NotImplementedError for anything other than a single BAM and
    ValueError when no variant regions are configured.
    """
    if len(align_bams) != 1:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    align_bam = align_bams[0]
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # The first three tab-separated fields of each line describe a region
            with open(target_regions) as in_handle:
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]

            # Argument unpacking replaces the deprecated apply() builtin
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
Beispiel #6
0
def run_samtools(align_bam, ref_file, config, dbsnp=None, region=None,
                 out_file=None):
    """Call SNPs and indels for one BAM using samtools mpileup and bcftools.

    The BAM is indexed through the broad runner's "picard_index" function.
    When ``out_file`` is not given it is derived from the BAM name. If the
    output does not exist yet, an empty VCF is written when regions are
    configured but the subset region file is missing; otherwise
    ``_call_variants_samtools`` runs inside a file transaction.

    Returns the output file path.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)

    if out_file is None:
        out_file = os.path.splitext(align_bam)[0] + "-variants.vcf"
    if file_exists(out_file):
        return out_file

    bam_name = os.path.basename(align_bam)
    logger.info("Genotyping with samtools: {region} {fname}".format(
        region=region, fname=bam_name))
    variant_regions = config["algorithm"].get("variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Regions were requested but no subset file is present on disk
    if variant_regions is not None and not os.path.isfile(target_regions):
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        _call_variants_samtools(align_bam, ref_file, config, target_regions,
                                tx_out_file)
    return out_file
Beispiel #7
0
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    align_bam -- BAM file to call variants on
    ref_file -- reference passed on to the per-region and combine steps
    config -- configuration; ``config["algorithm"]["variant_regions"]`` is required
    dbsnp -- not used by this function
    region -- optional region string; used for the work directory name and
        passed to ``subset_variant_regions``
    out_file -- output VCF path; derived from the BAM name when omitted

    Returns the output file path.

    Raises ValueError when no variant regions are configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file), region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # The first three tab-separated fields of each line describe a region
            with open(target_regions) as in_handle:
                regional_vcfs = [
                    _run_cortex_on_region(x.strip().split("\t")[:3], align_bam, ref_file, work_dir, out_file, config)
                    for x in in_handle
                ]

            # Argument unpacking replaces the deprecated apply() builtin
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
Beispiel #8
0
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files, region=None, out_file=None):
    """Index the input BAMs and run a variant calling function on them.

    Every BAM is indexed through the broad runner's "picard_index" function.
    When ``out_file`` is not given it is derived from the first BAM name.
    If the output does not exist yet, either an empty VCF is written (the
    subset regions are a string path that is not a file, or some BAM has no
    aligned reads in ``region``) or ``call_fn`` is run inside a file
    transaction.

    Returns the output file path.
    """
    runner = broad.runner_from_config(config)
    for bam in align_bams:
        runner.run_fn("picard_index", bam)

    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-variants.vcf"
    if file_exists(out_file):
        return out_file

    first_bam_name = os.path.basename(align_bams[0])
    logger.info(
        "Genotyping with {name}: {region} {fname}".format(name=name, region=region, fname=first_bam_name)
    )
    variant_regions = config["algorithm"].get("variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Only a string result is checked on disk; other result types skip the check
    regions_missing = (
        variant_regions is not None
        and isinstance(target_regions, basestring)
        and not os.path.isfile(target_regions)
    )
    if regions_missing or any(not realign.has_aligned_reads(bam, region) for bam in align_bams):
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
Beispiel #9
0
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None,
               out_file=None):
    """Entry point for regional de-novo variant calling with cortex_var.

    When ``out_file`` is not given it is derived from the BAM name. If the
    output does not exist yet, the BAM is indexed, cortex is run for every
    line of the subset region file and the per-region VCFs are combined into
    ``out_file``. An empty VCF is written when the subset region file is
    missing.

    Returns the output file path. Raises ValueError when no variant regions
    are configured.
    """
    runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = os.path.splitext(align_bam)[0] + "-cortex.vcf"
    if file_exists(out_file):
        return out_file

    runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions")
    if not variant_regions:
        raise ValueError(
            "Only regional variant calling with cortex_var is supported. Set variant_regions"
        )
    target_regions = subset_variant_regions(variant_regions, region, out_file)
    if not os.path.isfile(target_regions):
        write_empty_vcf(out_file)
        return out_file

    regional_vcfs = []
    with open(target_regions) as in_handle:
        for line in in_handle:
            # The first three tab-separated fields describe the region
            coords = line.strip().split("\t")[:3]
            regional_vcfs.append(
                _run_cortex_on_region(coords, align_bam, ref_file, out_file,
                                      config))
    combine_variant_files(regional_vcfs, out_file, ref_file, config)
    return out_file
Beispiel #10
0
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base,
                          config):
    """Run cortex on a specified chromosome start/end region.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config["program"].get("cortex")
    stampy_dir = config["program"].get("stampy")
    vcftools_dir = config["program"].get("vcftools")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError(
            "cortex_var requires path to pre-built cortex and stampy")
    region_str = apply("{0}-{1}-{2}".format, region)
    base_dir = safe_makedir(os.path.join(work_dir, region_str))
    try:
        out_vcf_base = os.path.join(
            base_dir, "{0}-{1}".format(
                os.path.splitext(os.path.basename(out_file_base))[0],
                region_str))
        out_file = os.path.join(
            work_dir, os.path.basename("{0}.vcf".format(out_vcf_base)))
        if not file_exists(out_file):
            fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
            if _count_fastq_reads(fastq, min_reads) < min_reads:
                write_empty_vcf(out_file)
            else:
                local_ref, genome_size = _get_local_ref(
                    region, ref_file, out_vcf_base)
                indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir,
                                           kmers)
                cortex_out = _run_cortex(
                    fastq, indexes, {
                        "kmers": kmers,
                        "genome_size": genome_size,
                        "sample": _get_sample_name(align_bam)
                    }, out_vcf_base, {
                        "cortex": cortex_dir,
                        "stampy": stampy_dir,
                        "vcftools": vcftools_dir
                    }, config)
                if cortex_out:
                    _remap_cortex_out(cortex_out, region, out_file)
                else:
                    write_empty_vcf(out_file)
    finally:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    return out_file
Beispiel #11
0
def run_samtools(align_bam, ref_file, config, dbsnp=None, region=None,
                 out_file=None):
    """Call SNPs and indels for one BAM using samtools mpileup and bcftools.

    The BAM is indexed through the broad runner's "picard_index" function.
    When ``out_file`` is not given it is derived from the BAM name. If the
    output does not exist yet, an empty VCF is written when regions are
    configured but the subset region file is missing; otherwise
    ``_call_variants_samtools`` runs inside a file transaction.

    Returns the output file path.
    """
    picard = broad.runner_from_config(config)
    picard.run_fn("picard_index", align_bam)

    if out_file is None:
        bam_base = os.path.splitext(align_bam)[0]
        out_file = bam_base + "-variants.vcf"
    if file_exists(out_file):
        return out_file

    message = "Genotyping with samtools: {region} {fname}".format(
        region=region, fname=os.path.basename(align_bam))
    logger.info(message)
    variant_regions = config["algorithm"].get("variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Run the caller unless regions were requested and the subset file is absent
    if variant_regions is None or os.path.isfile(target_regions):
        with file_transaction(out_file) as tx_out_file:
            _call_variants_samtools(align_bam, ref_file, config,
                                    target_regions, tx_out_file)
    else:
        write_empty_vcf(out_file)
    return out_file
Beispiel #12
0
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None,
               out_file=None):
    """Entry point for regional de-novo variant calling with cortex_var.

    When ``out_file`` is not given it is derived from the BAM name. If the
    output does not exist yet, the BAM is indexed, cortex is run for every
    line of the subset region file and the per-region VCFs are combined into
    ``out_file``. An empty VCF is written when the subset region file is
    missing.

    Returns the output file path. Raises ValueError when no variant regions
    are configured.
    """
    picard = broad.runner_from_config(config)
    if out_file is None:
        bam_base = os.path.splitext(align_bam)[0]
        out_file = bam_base + "-cortex.vcf"
    if file_exists(out_file):
        return out_file

    picard.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions")
    if not variant_regions:
        raise ValueError("Only regional variant calling with cortex_var is supported. Set variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    if os.path.isfile(target_regions):
        per_region_vcfs = []
        with open(target_regions) as in_handle:
            for bed_line in in_handle:
                # The first three tab-separated fields describe the region
                fields = bed_line.strip().split("\t")
                per_region_vcfs.append(_run_cortex_on_region(
                    fields[:3], align_bam, ref_file, out_file, config))
        combine_variant_files(per_region_vcfs, out_file, ref_file, config)
    else:
        write_empty_vcf(out_file)
    return out_file
Beispiel #13
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    align_bams -- BAM files to analyse; the first one names the default output
    items, ref_file, assoc_files -- passed through to ``_mutect_call_prep``
    region -- optional region; when it is not a list or tuple, every BAM must
        have aligned reads in it, otherwise an empty VCF is written
    out_file -- output VCF path; derived from the first BAM name when omitted

    Returns the output file path on every path, including those that write an
    empty VCF.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file)

        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            write_empty_vcf(out_file)
            # Return the path rather than None so callers get a consistent result
            return out_file

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            try:
                broad_runner.run_mutect(params)
            except CalledProcessError as error:
                java_exception = _parse_gatk_java_error_string(error.cmd)
                # HACK: Currently MuTect bails out on certain small BAM files
                # Until the issue is fixed by Broad, this specific exception
                # will be ignored. All the other exceptions will be raised
                # correctly.
                if java_exception not in _PASS_EXCEPTIONS:
                    raise
                # Fall through to the shared return so the path is still reported
                write_empty_vcf(tx_out_file)

    return out_file