Example #1
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # scale memory with the core count, leaving headroom for the program's own memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
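Every example on this page formats a genomic region for a command line with bamprep.region_to_gatk. As a point of reference, here is a minimal standalone sketch of that conversion, assuming bcbio-style (contig, start, end) tuples in 0-based, end-exclusive coordinates and GATK-style 1-based, end-inclusive interval strings; this is an illustration, not bcbio's actual implementation:

def region_to_gatk_sketch(region):
    """Illustrative sketch: format a region as a GATK-style interval string.

    Assumes (contig, start, end) tuples in 0-based, end-exclusive
    coordinates; GATK intervals are 1-based and end-inclusive, so the
    start shifts by one. Bare contig names pass through unchanged.
    """
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return "%s:%s-%s" % (chrom, int(start) + 1, end)
    return str(region)

print(region_to_gatk_sketch(("chr1", 0, 1000)))  # chr1:1-1000
print(region_to_gatk_sketch("chrM"))             # chrM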
Example #2
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Example #3
def _add_region_params(region, out_file, items, gatk_type):
    """Add parameters for selecting by region to command line.
    """
    params = []
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += gatk.standard_cl_params(items)
    return params
Example #4
File: gatk.py Project: Kisun/bcbio-nextgen
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Example #5
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    # Set the low frequency calling parameter if adjusted.
    # To set other MuTect parameters such as contamination, pass options
    # through the mutect resources section:
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # -drf (to turn off the duplicate read filter) is not currently supported in MuTect
    # params += gatk.standard_cl_params(items)
    return params
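The min_allele_fraction conversion above takes a percentage from the configuration and emits a fraction on the command line. Worked through with a hypothetical configured value:

min_af = 10  # hypothetical config value, in percent
print("%.2f" % (min_af / 100.0))  # 0.10 -> passed to --minimum_mutation_cell_fraction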
Example #6
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                # Recent GATK 3.x versions also have race conditions with multiple
                # threads, so limit to 1 and keep memory available
                # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
                # params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale, parallel_gc=True)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #7
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    # Set the low frequency calling parameter if adjusted.
    # To set other MuTect parameters such as contamination, pass options
    # through the mutect resources section:
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
Example #8
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; we may need to explore --batchSize to reduce
    memory usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
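The memscale dict that recurs throughout these examples ({"magnitude": 0.9 * cores, "direction": "increase"}) asks the runner to grow JVM memory with core count. A simplified, illustrative sketch of that style of adjustment applied to an -Xmx option; this is not bcbio's actual memory-adjustment code:

import math

def scale_xmx(xmx, magnitude, direction="increase"):
    """Illustrative only: scale a JVM option like '-Xmx2g' by a magnitude."""
    value, unit = int(xmx[len("-Xmx"):-1]), xmx[-1]
    factor = magnitude if direction == "increase" else 1.0 / magnitude
    return "-Xmx%d%s" % (int(math.ceil(value * factor)), unit)

cores = 8
memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
print(scale_xmx("-Xmx2g", memscale["magnitude"]))  # -Xmx15g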
Example #9
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine multiple VCF files into a single output file.

    Handles complex merging of samples and other tricky issues using GATK.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    broad_runner = broad.runner_from_config(config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, orig_file in enumerate(orig_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), orig_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region),
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #10
def prep_mpileup(align_bams, ref_file, max_read_depth, config, target_regions=None, want_bcf=True):
    cl = [
        config_utils.get_program("samtools", config),
        "mpileup",
        "-f",
        ref_file,
        "-d",
        str(max_read_depth),
        "-L",
        str(max_read_depth),
        "-m",
        "3",
        "-F",
        "0.0002",
    ]
    if want_bcf:
        cl += ["-D", "-S", "-u"]
    if target_regions:
        str_regions = bamprep.region_to_gatk(target_regions)
        if os.path.isfile(str_regions):
            cl += ["-l", str_regions]
        else:
            cl += ["-r", str_regions]
    cl += align_bams
    return " ".join(cl)
Example #11
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Example #12
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Example #13
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)

    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Limit the per-base read start count to the 200-10000 range, i.e. no
    # more than 10000 new reads may begin at any given base.
    # Further, limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    max_depth = min(max(200, get_in(paired.tumor_config,
                                    ("algorithm", "coverage_depth_max"), 10000)), 10000)
    params += ["--downsample_to_coverage", max_depth]
    params += ["--maxNumberOfReads", str(int(max_depth) * window_size)]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # note that the normal sample must have at least 4 reads of coverage
        params += ["--filter_expressions", "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af]
    else:
        params += ["--unpaired"]
        params += ["--filter_expressions", "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    return params
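The depth clamping in this example keeps --downsample_to_coverage within SID's workable range and sizes --maxNumberOfReads from the window size. Worked through with a hypothetical configured maximum:

window_size = 200                 # default SID window size
coverage_depth_max = 5000         # hypothetical configured value
max_depth = min(max(200, coverage_depth_max), 10000)
print(max_depth)                  # 5000 -> --downsample_to_coverage
print(max_depth * window_size)    # 1000000 -> --maxNumberOfReads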
Example #14
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #15
def read_backed_phasing(vcf_file, bam_files, genome_file, region, config):
    """Phase variants using GATK's read-backed phasing.
    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_phasing_ReadBackedPhasing.html
    """
    if has_variants(vcf_file):
        broad_runner = broad.runner_from_config(config)
        out_file = "%s-phased%s" % os.path.splitext(vcf_file)
        if not file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                params = ["-T", "ReadBackedPhasing",
                          "-R", genome_file,
                          "--variant", vcf_file,
                          "--out", tx_out_file,
                          "--downsample_to_coverage", "250",
                          "--downsampling_type", "BY_SAMPLE"]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                variant_regions = config["algorithm"].get("variant_regions", None)
                region = shared.subset_variant_regions(variant_regions, region, out_file)
                if region:
                    params += ["-L", bamprep.region_to_gatk(region),
                               "--interval_set_rule", "INTERSECTION"]
                broad_runner.run_gatk(params)
        return out_file
    else:
        return vcf_file
Example #16
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """
    Preparation work for MuTect.
    """

    #FIXME: We assume all other bits in the config are shared

    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")

    broad_runner = broad.runner_from_config(base_config, "mutect")

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)

    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)

    #FIXME: Add more parameters like fraction contamination etc

    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]

    tumor_bam = None
    normal_bam = None

    for bamfile, item in itertools.izip(align_bams, items):

        metadata = item["metadata"]

        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]

    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")

    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]

    if cosmic is not None:
        params += ["--cosmic", cosmic]

    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]

    return broad_runner, params
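A condensed, standalone rendering of the tumor/normal pairing loop above, with hypothetical sample dicts (the original iterates with itertools.izip under Python 2):

items = [{"metadata": {"phenotype": "tumor"}, "name": ("desc", "T1")},
         {"metadata": {"phenotype": "normal"}, "name": ("desc", "N1")}]
align_bams = ["tumor.bam", "normal.bam"]

tumor_bam = normal_bam = None
for bamfile, item in zip(align_bams, items):
    phenotype = item["metadata"]["phenotype"]
    if phenotype == "normal":
        normal_bam, normal_sample_name = bamfile, item["name"][1]
    elif phenotype == "tumor":
        tumor_bam, tumor_sample_name = bamfile, item["name"][1]

print(tumor_bam, tumor_sample_name)    # tumor.bam T1
print(normal_bam, normal_sample_name)  # normal.bam N1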
Example #17
def _add_region_params(region, out_file, items):
    """Add parameters for selecting by region to command line.
    """
    params = []
    variant_regions = tz.get_in(["config", "algorithm", "variant_regions"], items[0])
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
Example #18
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
Example #19
def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, six.string_types) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)
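Examples #18 and #19 differ mainly in the string check: basestring exists only on Python 2, while six.string_types works under both interpreters. Either way, callers must handle two return shapes, a BED path when the subset produced a file and a GATK-style region string otherwise; a small illustration with a hypothetical helper:

import os
import six

def describe_target(target):
    """Illustrative: callers must handle both return shapes."""
    if isinstance(target, six.string_types) and os.path.isfile(target):
        return "BED file: %s" % target
    return "region string: %s" % target

print(describe_target("chr5:500001-600000"))  # region string: chr5:500001-600000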
Example #20
def subset_vcf(in_file, region, out_file, config):
    """Subset VCF in the given region, handling bgzip and indexing of input.
    """
    work_file = vcfutils.bgzip_and_index(in_file, config)
    if not file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bcftools = config_utils.get_program("bcftools", config)
            region_str = bamprep.region_to_gatk(region)
            cmd = "{bcftools} view -r {region_str} {work_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "subset %s: %s" % (os.path.basename(work_file), region_str))
    return out_file
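Rendered with hypothetical paths, the command subset_vcf formats comes out as a plain bcftools invocation:

bcftools = "bcftools"
region_str = "chr1:1-100000"          # from bamprep.region_to_gatk
work_file = "input.vcf.gz"            # hypothetical bgzipped, indexed input
tx_out_file = "subset.vcf"            # hypothetical transactional output
cmd = "{bcftools} view -r {region_str} {work_file} > {tx_out_file}"
print(cmd.format(**locals()))
# bcftools view -r chr1:1-100000 input.vcf.gz > subset.vcf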
Example #21
def _get_interval(variant_regions, region, out_file, items):
    """Retrieve interval to run analysis in. Handles no targets, BED and regions
    """
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            return "--interval %s" % target
        else:
            return "--interval %s" % bamprep.region_to_gatk(target)
    else:
        return ""
Example #22
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50x worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
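The comment near the end is the key coordinate detail on this page: vcfutils.merge_variant_files expects regions in 1-based, end-inclusive form, which is exactly what bamprep.region_to_gatk produces. For orientation, the single-sample pipeline renders along these lines; all values below are hypothetical stand-ins (in the function above they come from config_utils, vcfutils.fix_ambiguous_cl() and the transaction context):

cmd = ("{vardict} -G {ref_file} -f {freq} "
       "-N {sample} -b {bamfile} {opts} "
       "| {strandbias}"
       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}"
       " > {tx_out_file}")
print(cmd.format(vardict="vardict", ref_file="ref.fa", freq=0.1, sample="S1",
                 bamfile="S1.bam", opts="-R chr1:1-100000",
                 strandbias="teststrandbias.R", var2vcf="var2vcf_valid.pl",
                 var2vcf_opts="", fix_ambig="fix-ambiguous-cl",
                 vcfallelicprimitives="vcfallelicprimitives",
                 vcfstreamsort="vcfstreamsort", compress_cmd="| bgzip -c",
                 tx_out_file="S1-variants.vcf.gz"))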
Example #23
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
Example #24
def _get_interval(variant_regions, region, out_file, items):
    """Retrieve interval to run analysis in. Handles no targets, BED and regions

    region can be a single region or list of multiple regions for multicore calling.
    """
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            return "--interval %s" % target
        else:
            return "--interval %s" % bamprep.region_to_gatk(target)
    else:
        return ""
Example #25
def run_gvcftyper(vrn_files, out_file, region, data):
    """Produce joint called variants from input gVCF files.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            license = license_export(data)
            ref_file = dd.get_ref_file(data)
            input_files = " ".join(vrn_files)
            region = bamprep.region_to_gatk(region)
            cmd = ("{license}sentieon driver -r {ref_file} --interval {region} "
                   "--algo GVCFtyper {tx_out_file} {input_files}")
            do.run(cmd.format(**locals()), "Sentieon GVCFtyper")
    return out_file
Example #26
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
Example #27
def _run_wham_genotype(in_file, all_bams, coords, data):
    """Run genotyping on a prepped, merged VCF file.
    """
    out_file = "%s-wgts%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_cores(data)
            ref_file = dd.get_ref_file(data)
            coord_str = bamprep.region_to_gatk(coords)
            cmd = ("WHAM-GRAPHENING -b {in_file} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotype WHAM: %s" % region.to_safestr(coords))
    return out_file
Example #28
def _bed_to_platypusin(region, base_file):
    """Convert BED file regions into Platypus custom inputs.
    """
    if isinstance(region, basestring) and os.path.isfile(region):
        out_file = "%s-platypusregion.list" % utils.splitext_plus(base_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for region in pybedtools.BedTool(region):
                        out_handle.write("%s:%s-%s\n" % (region.chrom, region.start, region.stop))
        return out_file
    else:
        return bamprep.region_to_gatk(region)
Example #29
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #30
def merge_gvcfs(data, region, vrn_files, out_file):
    """Simple merging of gVCFs with gvcftools.

    merge_variants does not appear to work correctly, so we remove gVCF parts
    with extract_variants and then combine the merged samples together.

    Longer term we plan to replace this with
    agg (https://github.com/Illumina/agg) or
    GLnexus (https://github.com/dnanexus-rnd/GLnexus).
    """
    if not utils.file_exists(out_file):
        region = bamprep.region_to_gatk(region)
        vcfutils.merge_variant_files([_extract_variants_from_gvcf(f, region, out_file, data) for f in vrn_files],
                                     out_file, dd.get_ref_file(data), data["config"], region)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #31
def _SID_call_prep(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)

    params = [
        "-R", ref_file, "-T", "SomaticIndelDetector", "-U",
        "ALLOW_N_CIGAR_READS"
    ]
    # Limit the per-base read start count to the 200-10000 range, i.e. no
    # more than 10000 new reads may begin at any given base.
    # Further, limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high coverage panels.
    paired = vcfutils.get_paired_bams(align_bams, items)
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(
        get_in(paired.tumor_config,
               ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # note that the normal sample must have at least 4 reads of coverage
        params += [
            "--filter_expressions",
            "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af
        ]
    else:
        params += ["--unpaired"]
        params += [
            "--filter_expressions",
            "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af
        ]
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    return params
Example #32
def _shared_gatk_call_prep(align_bams,
                           items,
                           ref_file,
                           dbsnp,
                           region,
                           out_file,
                           num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        bam.index(x, config)
    if num_cores > 1 and broad_runner.gatk_type() == "gatk4":
        # dbSNP annotation is not currently supported with multiple cores
        dbsnp = None
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    params += standard_cl_params(items)
    return broad_runner, params
Example #33
File: mutect.py Project: zeneofa/bcbio
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    params += ["--fraction_contamination", contamination]
    dbsnp = assoc_files["dbsnp"]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    return params
Example #34
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file, "-L",
                bamprep.region_to_gatk(region)
            ]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
Example #35
def _vardict_options_from_config(items, config, out_file, region=None, do_merge=False):
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    #["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    # "-k", "3", "-r", "4", "-m", "8"]

    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]

    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, do_merge=do_merge)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += [target]  # this must be the last option
        else:
            # one-based, end-inclusive coordinates, as for GATK
            opts += ["-R", bamprep.region_to_gatk(target)]
    return opts
Example #36
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = os.path.splitext(final_file)
    out_file = "%s-%s%s" % (base, region.to_safestr(coords), ext)
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            all_bams = ",".join([x["align_bam"]
                                 for x in inputs] + background_bams)
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = (
                "WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                "> {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Run WHAM: %s" % region.to_safestr(coords))
    return [[coords, out_file]]
Example #37
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = utils.splitext_plus(final_file)
    raw_file = "%s-%s.vcf" % (base, region.to_safestr(coords))
    all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
    if not utils.file_exists(raw_file):
        with file_transaction(inputs[0], raw_file) as tx_raw_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_raw_file}")
            do.run(cmd.format(**locals()), "Run WHAM: %s" % region.to_safestr(coords))
    merge_vcf = _run_wham_merge(raw_file, inputs[0])
    gt_vcf = _run_wham_genotype(merge_vcf, all_bams, coords, inputs[0])
    prep_vcf = vcfutils.sort_by_ref(gt_vcf, inputs[0])
    return [[coords, prep_vcf]]
Example #38
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(
        200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config,
                                      ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)

    params = [
        "-R",
        ref_file,
        "--standard_min_confidence_threshold_for_calling",
        confidence,
        "--standard_min_confidence_threshold_for_emitting",
        confidence,
        "--downsample_to_coverage",
        str(coverage_depth_max),
        "--downsampling_type",
        "BY_SAMPLE",
    ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    return broad_runner, params
Example #39
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; we may need to explore --batchSize to reduce
    memory usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = [
                    "-T", "GenomicsDBImport", "--reader-threads",
                    str(cores), "--genomicsdb-workspace-path",
                    os.path.relpath(out_dir, os.getcwd()), "-L",
                    bamprep.region_to_gatk(region)
                ]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                samplemap = _create_samplemap_file(vrn_files)
                params += ["--sample-name-map", samplemap]
                # For large inputs, reduce memory usage by batching
                # https://github.com/bcbio/bcbio-nextgen/issues/2852
                if len(vrn_files) > 200:
                    params += ["--batch-size", "50"]
                memscale = {
                    "magnitude": 0.9 * cores,
                    "direction": "increase"
                } if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
Example #40
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        bam.index(x, config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Example #41
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    if dd.is_set_coverage_depth_max(data):
        coverage_depth_max = dd.get_coverage_depth_max(data)
        # GATK can only downsample to a minimum of 200
        coverage_depth_max = max([200, coverage_depth_max])
        params += [
            "--downsample_to_coverage",
            str(coverage_depth_max), "--downsampling_type", "BY_SAMPLE"
        ]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence,
            "--standard_min_confidence_threshold_for_emitting", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    return broad_runner, params
Example #42
def run_gvcfgenotyper(data, orig_region, vrn_files, out_file):
    """Merge strelka2 and Illumina compatible gVCFs with gvcfgenotyper.

    https://github.com/Illumina/gvcfgenotyper

    Also need to explore GLnexus (https://github.com/dnanexus-rnd/GLnexus)
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            regions = _find_gvcf_blocks(vrn_files[0], bamprep.region_to_gatk(orig_region),
                                        os.path.dirname(tx_out_file))
            if len(regions) == 1:
                _run_gvcfgenotyper(data, regions[0], vrn_files, tx_out_file)
            else:
                split_outs = [_run_gvcfgenotyper(data, r, vrn_files,
                                                 "%s-%s.vcf.gz" % (utils.splitext_plus(out_file)[0],
                                                                   r.replace(":", "_").replace("-", "_")))
                              for r in regions]
                vcfutils.concat_variant_files(split_outs, tx_out_file, regions,
                                              dd.get_ref_file(data), data["config"])
    return vcfutils.bgzip_and_index(out_file, data["config"])
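The per-region split outputs are named by sanitizing the GATK-style region string for filesystem use; for example:

r = "chr1:1-100000"  # a region string of the form returned by _find_gvcf_blocks
print(r.replace(":", "_").replace("-", "_"))  # chr1_1_100000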
Example #43
def prep_mpileup(align_bams,
                 ref_file,
                 config,
                 max_read_depth=None,
                 target_regions=None,
                 want_bcf=True):
    cl = [
        config_utils.get_program("samtools", config), "mpileup", "-f", ref_file
    ]
    if max_read_depth:
        cl += ["-d", str(max_read_depth), "-L", str(max_read_depth)]
    if want_bcf:
        cl += ["-t", "DP", "-u", "-g"]
    if target_regions:
        str_regions = bamprep.region_to_gatk(target_regions)
        if os.path.isfile(str_regions):
            cl += ["-l", str_regions]
        else:
            cl += ["-r", str_regions]
    cl += align_bams
    return " ".join(cl)
Example #44
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = [
                "-T", "GenotypeGVCFs", "--variant",
                "gendb://%s" % genomics_db, "-R",
                dd.get_ref_file(data), "--output", tx_out_file, "-L",
                bamprep.region_to_gatk(region)
            ]
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #45
File: vcfutils.py Project: zeneofa/bcbio
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region),
                           "--interval_set_rule", "INTERSECTION"]
            jvm_opts = broad.get_gatk_framework_opts(config)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #46
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = [
        "-R",
        ref_file,
        "--standard_min_confidence_threshold_for_calling",
        confidence,
        "--standard_min_confidence_threshold_for_emitting",
        confidence,
        "--downsample_to_coverage",
        "250",
        "--downsampling_type",
        "BY_SAMPLE",
    ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    return broad_runner, params
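
The function returns the runner and shared parameter list without executing anything. A minimal sketch of how a caller might consume the pair; the UnifiedGenotyper tool choice and output wiring are assumptions, not taken from this example, though run_gatk itself appears in the GATK4 examples here:

broad_runner, params = _shared_gatk_call_prep(align_bams, ref_file, config,
                                              dbsnp, region, out_file)
params += ["-T", "UnifiedGenotyper",                 # assumed downstream tool
           "--out", out_file,
           "--genotype_likelihoods_model", "BOTH"]   # SNPs and indels together
broad_runner.run_gatk(params)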
Example #47
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
            # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
            # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
            resources = config_utils.get_resources("gatk", data["config"])
            params += [str(x) for x in resources.get("options", [])]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
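
The memscale dictionary asks the runner to grow the JVM heap roughly in proportion to the available cores (0.9 per core, presumably leaving headroom for the rest of the process). A toy illustration of that arithmetic; scale_jvm_memory is a hypothetical helper, not the runner's actual method:

def scale_jvm_memory(base_mb, memscale):
    # Hypothetical helper: scale a base JVM heap by a memscale request.
    if memscale and memscale["direction"] == "increase":
        return int(base_mb * max(1.0, memscale["magnitude"]))
    return base_mb

memscale = {"magnitude": 0.9 * 8, "direction": "increase"}  # 8 cores
scale_jvm_memory(2048, memscale)  # -> 14745, i.e. roughly 14g instead of 2g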
Example #48
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += [
            "--minimum_mutation_cell_fraction",
            "%.2f" % (min_af / 100.0)
        ]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
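
Note the unit conversion: the pipeline stores min_allele_fraction as a percentage, while MuTect expects a fraction, so a configured value of 10 becomes "0.10":

min_af = 10                # percent, as stored in the configuration
"%.2f" % (min_af / 100.0)  # -> '0.10', the value passed to MuTect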
Example #49
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files:
                assoc_files = {}
            params = [
                "-T", "GenotypeGVCFs", "-R", ref_file, "-o", tx_out_file, "-L",
                bamprep.region_to_gatk(region)
            ]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                params += ["-nt", str(cores)]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
Example #50
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested as scale, need to explore --batchSize to reduce memory
    usage if needed.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir):
        with file_transaction(data, out_dir) as tx_out_dir:
            broad_runner = broad.runner_from_config(data["config"])
            cores = dd.get_cores(data)
            params = [
                "-T", "GenomicsDBImport", "--readerThreads",
                str(cores), "--genomicsDBWorkspace", tx_out_dir, "-L",
                bamprep.region_to_gatk(region)
            ]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
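
A hedged sketch of how this import step chains into the GATK4 joint genotyping shown in Example #47; vrn_files, region, out_file and data are assumed pipeline values:

genomics_db = _run_genomicsdb_import(vrn_files, region, out_file, data)
joint_vcf = _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data)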
Example #51
def combine_variant_files(orig_files,
                          out_file,
                          ref_file,
                          config,
                          quiet_out=True,
                          region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index,
                                        [[x, config] for x in exist_files],
                                        config)
            params = [
                "-T", "CombineVariants", "-R", ref_file, "--out", tx_out_file
            ]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(
                    ["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(
                    ["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get(
                    "variant_regions", None)
                cur_region = shared.subset_variant_regions(
                    variant_regions, region, out_file)
                if cur_region:
                    params += [
                        "-L",
                        bamprep.region_to_gatk(cur_region),
                        "--interval_set_rule", "INTERSECTION"
                    ]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(
                config, os.path.dirname(tx_out_file), memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                   "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{
            file_key: out_file,
            "region": region,
            "sam_ref": ref_file,
            "config": config
        }]
    else:
        return out_file
Example #52
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   do_merge=False)
            num_bams = len(align_bams)
            # per-sample outputs, since batch calling may be required
            sample_vcf_names = []
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # for deep targeted panels, require at least 50 reads of
                # coverage to report a variant (-v 50)
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
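
For orientation, a hedged reconstruction of the shell pipeline assembled above for one sample. All paths and values are placeholders, <opts> stands in for the output of _vardict_options_from_config (not shown), and the fix_ambig_ref/fix_ambig_alt/remove_dup/vcfstreamsort stages are omitted for brevity:

pipeline = ("vardict -G ref.fa -f 0.10 -N SAMPLE -b SAMPLE.bam <opts> "
            "| teststrandbias.R "
            "| var2vcf_valid.pl -N SAMPLE -E -f 0.10 -v 50 "
            "| bgzip -c > SAMPLE-variants.vcf.gz")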
Example #53
def _mutect_call_prep(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """
    Preparation work for MuTect.
    """

    #FIXME: We assume all other bits in the config are shared

    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")

    broad_runner = broad.runner_from_config(base_config, "mutect")

    mutect_version = broad_runner.get_mutect_version()

    if mutect_version is None:
        logger.warn("MuTect version could not be determined from the jar "
                    "file. Please ensure you are using at least version "
                    "1.1.5, as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding, assuming version 1.1.5 or higher.")
    elif LooseVersion(mutect_version) < LooseVersion("1.1.5"):
        raise ValueError(
            "MuTect 1.1.4 and lower is known to have incompatibilities "
            "with Java < 7, and this may lead to problems in analyses. "
            "Please use MuTect 1.1.5 or higher (note that it requires "
            "Java 7).")

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)

    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)

    #FIXME: Add more parameters like fraction contamination etc

    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]

    tumor_bam = None
    normal_bam = None

    for bamfile, item in itertools.izip(align_bams, items):

        metadata = item["metadata"]

        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]

    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")

    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]

    if cosmic is not None:
        params += ["--cosmic", cosmic]

    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]

    return broad_runner, params
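
The version gate uses LooseVersion rather than plain string comparison because lexicographic ordering breaks on multi-digit components:

from distutils.version import LooseVersion

"1.1.10" >= "1.1.5"                              # False: string comparison
LooseVersion("1.1.10") >= LooseVersion("1.1.5")  # True: numeric components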
Example #54
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            # per-sample outputs, since batch calling may be required
            sample_vcf_names = []
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
Example #55
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            # per-sample outputs, since batch calling may be required
            sample_vcf_names = []
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 region))
                vcfallelicprimitives = config_utils.get_program(
                    "vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads of
                # coverage to report a variant (-v 50)
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = (
                    "{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()),
                           "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #56
0
def _mutect_call_prep(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """
    Preparation work for MuTect.
    """

    #FIXME: We assume all other bits in the config are shared

    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")

    broad_runner = broad.runner_from_config(base_config, "mutect")

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)

    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)

    #FIXME: Add more parameters like fraction contamination etc

    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]

    tumor_bam = None
    normal_bam = None

    for bamfile, item in itertools.izip(align_bams, items):

        metadata = item["metadata"]

        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]

    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")

    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]

    if cosmic is not None:
        params += ["--cosmic", cosmic]

    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]

    return broad_runner, params