def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, config):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    broad_runner = broad.runner_from_config(config)
    if not broad_runner.has_gatk():
        return orig_file
    else:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                # Avoid issues with incorrectly created empty GATK index files.
                # Occurs when GATK cannot lock shared dbSNP database on previous run
                idx_file = orig_file + ".idx"
                if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                    os.remove(idx_file)
                annotations = get_gatk_annotations(config)
                params = ["-T", "VariantAnnotator",
                          "-R", ref_file,
                          "--variant", orig_file,
                          "--dbsnp", dbsnp_file,
                          "--out", tx_out_file,
                          "-L", orig_file]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                for x in annotations:
                    params += ["-A", x]
                broad_runner = broad.runner_from_config(config)
                broad_runner.run_gatk(params, memory_retry=True)
        vcfutils.bgzip_and_index(out_file, config)
        return out_file
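Nearly every example here writes its output through file_transaction so a killed run never leaves a truncated file behind. A minimal sketch of that transactional pattern (hypothetical name file_transaction_sketch; bcbio's real implementation also handles multiple outputs, per-sample configuration, and shared filesystems):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def file_transaction_sketch(final_path):
    """Yield a temporary path; move it into place only if the block succeeds."""
    tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(os.path.abspath(final_path)))
    tx_path = os.path.join(tmp_dir, os.path.basename(final_path))
    try:
        yield tx_path
        if os.path.exists(tx_path):
            shutil.move(tx_path, final_path)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

On an exception inside the with block, the temporary directory is removed and final_path is never created, which is what makes the `if not utils.file_exists(out_file)` guards in these examples safe to re-run.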
Example #2
def get_version(config):
    from bcbio import broad
    runner = broad.runner_from_config(config)
    # NOTE: `type` is a free variable here; in the original source it is
    # bound by an enclosing factory function (see the sketch below).
    if type == "gatk":
        return runner.get_gatk_version()
    elif type == "picard":
        return runner.get_picard_version("ViewSam")
    elif type == "mutect":
        runner = broad.runner_from_config(config, "mutect")
        return runner.get_mutect_version()
    else:
        raise NotImplementedError(type)
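In both get_version snippets (here and in Example #4), `type` is a free variable rather than a parameter; in the original source it is bound by an enclosing factory. A minimal sketch of that closure pattern (factory name assumed):

def _broad_versioner(type):
    """Bind `type` so the returned get_version(config) can dispatch on it."""
    def get_version(config):
        from bcbio import broad
        runner = broad.runner_from_config(config)
        if type == "gatk":
            return runner.get_gatk_version()
        raise NotImplementedError(type)
    return get_version

gatk_version = _broad_versioner("gatk")  # gatk_version(config) -> version string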
Example #3
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full, non-open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for a in annotation.get_gatk_annotations(items[0]["config"]):
                params += ["--annotation", a]
            paired = vcfutils.get_paired_bams(align_bams, items)
            params += _add_tumor_params(paired)
            params += _add_region_params(region, out_file, items)
            params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner = broad.runner_from_config(items[0]["config"])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "MuTect2")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
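These callers hand a fully formatted shell pipeline to do.run. A minimal stand-in for that helper (illustrative only; bcbio's real do.run also logs to the pipeline log and supports output checks):

import subprocess

def run_sketch(cmd, descr):
    """Run a shell pipeline, failing loudly on a non-zero exit."""
    print("Running: %s" % descr)
    # shell=True is needed because these commands use pipes and redirection,
    # e.g. "gatk ... | bgzip -c > out.vcf.gz"
    subprocess.check_call(cmd, shell=True, executable="/bin/bash")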
Example #4
def get_version(config):
    from bcbio import broad
    try:
        runner = broad.runner_from_config(config)
    except ValueError:
        return ""
    # `type` is again a free variable, bound by an enclosing factory
    # (see the sketch after Example #2).
    if type == "gatk":
        return runner.get_gatk_version()
    elif type == "mutect":
        try:
            runner = broad.runner_from_config(config, "mutect")
        except ValueError:
            return ""
        return runner.get_mutect_version()
    else:
        raise NotImplementedError(type)
Example #5
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)
    runner.run_fn("picard_index_ref", ref_file)
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        runner.run_fn("picard_index", align_bam)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam,
                                                     ref_file, dbsnp, region,
                                                     out_file, deep_coverage,
                                                     variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # No longer required in recent GATK (> Feb 2011) -- now done on the fly
        # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
        return realign_bam
    elif out_file:
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam
Example #6
def _convert_bam_to_fastq(in_file, work_dir, item, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, item["sam_ref"], out_bindir, config)

    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #7
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Example #8
def select_unaligned_read_pairs(in_bam, extra, out_dir, config):
    """Retrieve unaligned read pairs from input alignment BAM, as two fastq files.
    """
    runner = broad.runner_from_config(config)
    base, ext = os.path.splitext(os.path.basename(in_bam))
    nomap_bam = os.path.join(out_dir, "{}-{}{}".format(base, extra, ext))
    if not utils.file_exists(nomap_bam):
        with file_transaction(nomap_bam) as tx_out:
            runner.run("FilterSamReads", [("INPUT", in_bam),
                                          ("OUTPUT", tx_out),
                                          ("EXCLUDE_ALIGNED", "true"),
                                          ("WRITE_READS_FILES", "false"),
                                          ("SORT_ORDER", "queryname")])
    has_reads = False
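    # NOTE: pysam.Samfile is the legacy alias for pysam.AlignmentFile;
    # modern pysam keeps it only for backwards compatibility.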
    with closing(pysam.Samfile(nomap_bam, "rb")) as in_pysam:
        for read in in_pysam:
            if read.is_paired:
                has_reads = True
                break
    if has_reads:
        out_fq1, out_fq2 = ["{}-{}.fq".format(os.path.splitext(nomap_bam)[0], i) for i in [1, 2]]
        runner.run_fn("picard_bam_to_fastq", nomap_bam, out_fq1, out_fq2)
        return out_fq1, out_fq2
    else:
        return None, None
Example #9
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.
    """
    cutoffs = ["100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91",
               "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0"]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        # Sort numerically (descending, matching the initial list); a plain
        # sort() would order these strings lexicographically.
        cutoffs.sort(key=float, reverse=True)
    broad_runner = broad.runner_from_config(data["config"])
    base = utils.splitext_plus(in_file)[0]
    recal_file = "%s.recal" % base
    tranches_file = "%s.tranches" % base
    if not utils.file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params = ["-T", "VariantRecalibrator",
                      "-R", ref_file,
                      "--input", in_file,
                      "--mode", filter_type,
                      "--recal_file", tx_recal,
                      "--tranches_file", tx_tranches]
            for cutoff in cutoffs:
                params += ["-tranche", str(cutoff)]
            params += _get_vqsr_training(filter_type, vrn_files)
            for a in _get_vqsr_annotations(filter_type):
                params += ["-an", a]
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            except Exception:  # Can fail to run if not enough values are present to train.
                return None, None
    return recal_file, tranches_file
Example #10
def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
Example #11
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Example #12
def _piped_input_cl(data, region, tmp_dir, out_base_file, prep_params):
    """Retrieve the commandline for streaming input into preparation step.
    If marking duplicates, this requires writing an intermediate file since
    MarkDuplicates uses multiple passed on an input.
    """
    broad_runner = broad.runner_from_config(data["config"])
    cl = _gatk_extract_reads_cl(data, region, prep_params, tmp_dir)
    if prep_params["dup"] == "picard":
        sel_file = "%s-select%s" % os.path.splitext(out_base_file)
        if not utils.file_exists(sel_file):
            with file_transaction(sel_file) as tx_out_file:
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK: PrintReads {0}".format(region), data)
        dup_metrics = "%s-dup.dup_metrics" % os.path.splitext(out_base_file)[0]
        compression = "5" if prep_params["realign"] == "gatk" else "0"
        cl = broad_runner.cl_picard("MarkDuplicates",
                                    [("INPUT", sel_file),
                                     ("OUTPUT", "/dev/stdout"),
                                     ("METRICS_FILE", dup_metrics),
                                     ("PROGRAM_RECORD_ID", "null"),
                                     ("COMPRESSION_LEVEL", compression),
                                     ("TMP_DIR", tmp_dir)])
    elif not prep_params["dup"]:
        sel_file = data["work_bam"]
    else:
        raise ValueError("Duplication approach not supported with GATK: %s" % prep_params["dup"])
    broad_runner.run_fn("picard_index", sel_file)
    return sel_file, " ".join(cl)
Example #13
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #14
def _shared_gatk_call_prep(align_bam, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    region = subset_variant_regions(variant_regions, region, out_file)

    bfh = config["resources"]["gatk"].get("max_bam_file_handle", 1024)

    params = ["-I", align_bam,
              "-R", ref_file,
              "-bfh", str(bfh),
              "--annotation", "QualByDepth",
              "--annotation", "HaplotypeScore",
              "--annotation", "MappingQualityRankSumTest",
              "--annotation", "ReadPosRankSumTest",
              "--annotation", "FisherStrand",
              "--annotation", "RMSMappingQuality",
              "--annotation", "DepthOfCoverage",
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              ]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params, out_file
Example #15
def gatk_evaluate_variants(vcf_file, ref_file, config, dbsnp=None, intervals=None):
    """Evaluate variants, return SNP counts and Transition/Transversion ratios.
    """
    runner = broad.runner_from_config(config)
    eval_file = variant_eval(vcf_file, ref_file, dbsnp, intervals, runner)
    stats = _extract_eval_stats(eval_file)
    return _format_stats(stats['called'])
Example #16
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file, config):
    """Run BAM recalibration with the given input
    """
    if not file_exists(out_file):
        if _recal_available(recal_file):
            broad_runner = broad.runner_from_config(config)
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "PrintReads",
                              "-BQSR", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              ]
                    base_bed = config["algorithm"].get("variant_regions", None)
                    region_bed = subset_variant_regions(base_bed, region, tx_out_file)
                    if region_bed:
                        params += ["-L", region_bed, "--interval_set_rule", "INTERSECTION"]
                    elif region:
                        params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        elif region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Example #17
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine multiple VCF files into a single output file.

    Handles complex merging of samples and other tricky issues using GATK.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    broad_runner = broad.runner_from_config(config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, orig_file in enumerate(orig_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), orig_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region),
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
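For two inputs, the priority-naming loop above produces flags like the following (runnable illustration):

orig_files = ["a.vcf", "b.vcf"]  # illustrative inputs
params, priority_order = [], []
for i, orig_file in enumerate(orig_files):
    name = "v%s" % i
    params.extend(["--variant:{name}".format(name=name), orig_file])
    priority_order.append(name)
params.extend(["--rod_priority_list", ",".join(priority_order)])
print(params)
# ['--variant:v0', 'a.vcf', '--variant:v1', 'b.vcf',
#  '--rod_priority_list', 'v0,v1']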
Example #18
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
Example #19
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                # Recent GATK 3.x versions also have race conditions with multiple
                # threads, so limit to 1 and keep memory available
                # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
                # params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale, parallel_gc=True)
    return vcfutils.bgzip_and_index(out_file, data["config"])
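The memscale dictionary asks the runner to rescale JVM memory: when a tool runs as a single process on a multi-core machine, its -Xmx gets scaled up toward the memory the unused cores would have received. A sketch of what such scaling could look like (assumed behavior, not bcbio's exact implementation):

def scale_jvm_memory(base_mb, memscale):
    """Scale a base per-process JVM memory (-Xmx) by the requested magnitude."""
    if not memscale:
        return base_mb
    factor = memscale["magnitude"]
    if memscale["direction"] == "increase":
        return int(base_mb * factor)
    return int(base_mb / factor)

# With 8 cores: scale_jvm_memory(2048, {"magnitude": 0.9 * 8, "direction": "increase"})
# -> 14745, i.e. one JVM using ~90% of the memory eight single-core processes would get.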
Example #20
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                    tranches_file, filter_type)
Example #21
def get_gatk_annotations(config, include_depth=True, include_baseqranksum=True,
                         gatk_input=True):
    """Retrieve annotations to use for GATK VariantAnnotator.

    If include_depth is false, we'll skip annotating DP. Since GATK downsamples,
    this will undercount on high-depth sequencing, and the standard outputs
    from the original callers may be preferable.

    BaseQRankSum can cause issues with some MuTect2 and other runs, so we
    provide an option to skip it.
    """
    broad_runner = broad.runner_from_config(config)
    anns = ["MappingQualityRankSumTest", "MappingQualityZero",
            "QualByDepth", "ReadPosRankSumTest", "RMSMappingQuality"]
    if include_baseqranksum:
        anns += ["BaseQualityRankSumTest"]
    # Some annotations not working correctly with external datasets and GATK 3
    if gatk_input or broad_runner.gatk_type() == "gatk4":
        anns += ["FisherStrand"]
    if broad_runner.gatk_type() == "gatk4":
        anns += ["MappingQuality"]
    else:
        anns += ["GCContent", "HaplotypeScore", "HomopolymerRun"]
    if include_depth:
        anns += ["DepthPerAlleleBySample"]
        if broad_runner.gatk_type() in ["restricted", "gatk4"]:
            anns += ["Coverage"]
        else:
            anns += ["DepthOfCoverage"]
    return anns
Example #22
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file,
                                                     platform, dbsnp_file, intervals, data)
    return [[data]]
Example #23
def read_backed_phasing(vcf_file, bam_files, genome_file, region, config):
    """Phase variants using GATK's read-backed phasing.
    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_phasing_ReadBackedPhasing.html
    """
    if has_variants(vcf_file):
        broad_runner = broad.runner_from_config(config)
        out_file = "%s-phased%s" % os.path.splitext(vcf_file)
        if not file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                params = ["-T", "ReadBackedPhasing",
                          "-R", genome_file,
                          "--variant", vcf_file,
                          "--out", tx_out_file,
                          "--downsample_to_coverage", "250",
                          "--downsampling_type", "BY_SAMPLE"]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                variant_regions = config["algorithm"].get("variant_regions", None)
                region = shared.subset_variant_regions(variant_regions, region, out_file)
                if region:
                    params += ["-L", bamprep.region_to_gatk(region),
                               "--interval_set_rule", "INTERSECTION"]
                broad_runner.run_gatk(params)
        return out_file
    else:
        return vcf_file
Example #24
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    if file_exists(out_file):
        return out_file
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    picard = broad.runner_from_config(config)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    return out_file
Example #25
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, aligner, data)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], config)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        _check_prealigned_bam(fastq1, data["sam_ref"], config)
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    data["work_bam"] = out_bam
    return [[data]]
Example #26
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Example #27
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
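                    # NOTE: "samtools sort ... - {prefix}" is the pre-1.0
                    # positional-prefix syntax; samtools >= 1.0 expects
                    # "-T {prefix} -o {out}.bam" instead.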
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #28
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--downsample_to_coverage", max(200, get_in(paired.tumor_config,
                                                           ("algorithm", "coverage_depth_max"), 10000))]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
Example #29
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name,
                    rg_name, lane_name, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    picard = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    qual_format = config["algorithm"].get("quality_format", None)
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam", fastq1, fastq2,
                                  base_dir, platform, sample_name, rg_name, lane_name,
                                  qual_format)
    out_bam = picard.run_fn("picard_sam_to_bam", sam_file, out_fastq_bam, ref_file,
                            fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
    # merge FASTQ files, only if barcoded samples in the work directory
    if (os.path.commonprefix([fastq1, sort_bam]) ==
            os.path.split(os.path.dirname(sort_bam))[0]
            and not config["algorithm"].get("upload_fastq", True)):
        utils.save_diskspace(fastq1, "Merged into output BAM %s" % out_bam, config)
        if fastq2:
            utils.save_diskspace(fastq2, "Merged into output BAM %s" % out_bam, config)
    return sort_bam
Example #30
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; we may need to explore --batchSize to reduce
    memory usage.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
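The _incomplete_genomicsdb helper is referenced but not shown. One plausible heuristic, purely illustrative, is to treat a workspace missing GenomicsDB's callset metadata as half-finished:

import os

def _incomplete_genomicsdb_sketch(out_dir):
    """Illustrative only: assumes GenomicsDBImport writes callset metadata
    (callset.json) once an import completes, so its absence suggests a
    half-finished workspace that should be rebuilt."""
    return not os.path.exists(os.path.join(out_dir, "callset.json"))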
Example #31
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file, "-L",
                bamprep.region_to_gatk(region)
            ]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
Example #32
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Filter variant calls using Variant Quality Score Recalibration.
    """
    broad_runner = broad.runner_from_config(config)
    snp_file, indel_file = split_snps_indels(broad_runner, call_file, ref_file)
    snp_filter_file = _variant_filtration_snp(broad_runner, snp_file, ref_file,
                                              vrn_files, config)
    indel_filter_file = _variant_filtration_indel(broad_runner, indel_file,
                                                  ref_file, vrn_files, config)
    orig_files = [snp_filter_file, indel_filter_file]
    out_file = "{base}combined.vcf".format(
        base=os.path.commonprefix(orig_files))
    return combine_variant_files(orig_files, out_file, ref_file, config)
Example #33
def run_region(data, region, vrn_files, out_file):
    """Perform variant calling on gVCF inputs in a specific genomic region.
    """
    broad_runner = broad.runner_from_config(data["config"])
    if broad_runner.gatk_type() == "gatk4":
        genomics_db = _run_genomicsdb_import(vrn_files, region, out_file, data)
        return _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file,
                                              data)
    else:
        vrn_files = _batch_gvcfs(data, region, vrn_files,
                                 dd.get_ref_file(data), out_file)
        return _run_genotype_gvcfs_gatk3(data, region, vrn_files,
                                         dd.get_ref_file(data), out_file)
Example #34
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging; tools_off: [gatk4] is meant to apply to variant calling only
            config = utils.deepish_copy(config)
            if "gatk4" in tz.get_in(["algorithm", "tools_off"], config, []):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
Example #35
def read_backed_phasing(vcf_file, bam_file, genome_file, config):
    """Annotate predicted variant effects using snpEff.
    """
    broad_runner = broad.runner_from_config(config)
    out_file = "%s-phased%s" % os.path.splitext(vcf_file)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = [
                "-T", "ReadBackedPhasing", "-R", genome_file, "-I", bam_file,
                "--variant", vcf_file, "--out", tx_out_file
            ]
            broad_runner.run_gatk(params)
    return out_file
Example #36
def get_gatk_annotations(config):
    broad_runner = broad.runner_from_config(config)
    anns = [
        "BaseQualityRankSumTest", "FisherStrand", "GCContent",
        "HaplotypeScore", "HomopolymerRun", "MappingQualityRankSumTest",
        "MappingQualityZero", "QualByDepth", "ReadPosRankSumTest",
        "RMSMappingQuality", "DepthPerAlleleBySample"
    ]
    if broad_runner.gatk_type() == "restricted":
        anns += ["Coverage"]
    else:
        anns += ["DepthOfCoverage"]
    return anns
Example #37
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data,
                         out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    broad_runner = broad.runner_from_config_safe(data["config"])
    if not broad_runner or not broad_runner.has_gatk() or broad_runner.gatk_type() == "gatk4":
        if dbsnp_file:
            return add_dbsnp(orig_file, dbsnp_file, data, out_file)
        else:
            return orig_file
    else:
        if out_file is None:
            out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                # Avoid issues with incorrectly created empty GATK index files.
                # Occurs when GATK cannot lock shared dbSNP database on previous run
                idx_file = orig_file + ".idx"
                if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                    os.remove(idx_file)
                annotations = get_gatk_annotations(data["config"], include_depth=False)
                params = ["-T", "VariantAnnotator",
                          "-R", ref_file,
                          "--variant", orig_file,
                          "--out", tx_out_file,
                          "-L", orig_file]
                if dbsnp_file:
                    params += ["--dbsnp", dbsnp_file]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                for x in annotations:
                    params += ["-A", x]
                if ("--allow_potentially_misencoded_quality_scores" not in params
                      and "-allowPotentiallyMisencodedQuals" not in params):
                    params += ["--allow_potentially_misencoded_quality_scores"]
                # Be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq).
                # Start by removing existing -U or --unsafe opts
                # (if another option starting with -U is ever added to GATK, this may create a bug).
                unsafe_options = [x for x in params if x.startswith(("-U", "--unsafe"))]
                for my_opt in unsafe_options:
                    ind_to_rem = params.index(my_opt)
                    # are the options given as separate strings or in one?
                    if my_opt.strip() == "-U" or my_opt.strip() == "--unsafe":
                        params.pop(ind_to_rem + 1)
                    params.pop(ind_to_rem)
                params.extend(["-U", "ALL"])
                broad_runner = broad.runner_from_config(data["config"])
                broad_runner.run_gatk(params)
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
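Several examples build output names with utils.splitext_plus; unlike os.path.splitext, it keeps compound extensions such as .vcf.gz together, so the substitution above yields names like sample-gatkann.vcf.gz. A minimal sketch of that behavior (hypothetical name):

import os

def splitext_plus_sketch(path):
    """Split a filename, keeping compound extensions like .vcf.gz intact."""
    base, ext = os.path.splitext(path)
    if ext == ".gz":
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

# splitext_plus_sketch("sample.vcf.gz") -> ("sample", ".vcf.gz"), so
# "%s-gatkann%s" % splitext_plus_sketch("sample.vcf.gz") == "sample-gatkann.vcf.gz"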
Example #38
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
Example #39
def run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file]
            if region:
                params += ["-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #40
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.
    """
    broad_runner = broad.runner_from_config(data["config"])
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", data["sam_ref"],
            "-I", data["work_bam"]]
    if prep_params["recal"] == "gatk":
        if _recal_has_reads(data["prep_recal"]):
            args += ["-BQSR", data["prep_recal"]]
    elif prep_params["recal"]:
        raise NotImplementedError("Recalibration method %s" % prep_params["recal"])
    return broad_runner.cl_gatk(args, tmp_dir)
Example #41
def _cnn_score_variants(in_file, tensor_type, data):
    """Score variants with pre-trained CNN models.
    """
    out_file = "%s-cnnscore.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CNNScoreVariants", "--variant", in_file, "--reference", dd.get_ref_file(data),
                    "--output", tx_out_file, "--input", dd.get_align_bam(data)]
            params += ["--tensor-type", tensor_type]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #42
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full, non-open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "-R",
                ref_file, "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC"
            ]
            for a in annotation.get_gatk_annotations(items[0]["config"]):
                params += ["--annotation", a]
            paired = vcfutils.get_paired_bams(align_bams, items)
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params,
                                            os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_file,
                                             tx_out_file)
                cmd = "{gatk_cmd} -O {tx_raw_file} && {filter_cmd}"
            else:
                cmd = "{gatk_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "MuTect2")

    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example #43
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(
        data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    # set a maximum depth to avoid calling in repetitive regions with excessive coverage
    max_depth = int(1e6 if data["config"]["algorithm"].get(
        "coverage_depth", "").lower() == "super-high" else 2.5e4)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bam.index(data["work_bam"], data["config"])
            params = [
                "-T", "CallableLoci", "-R", data["sam_ref"], "-I",
                data["work_bam"], "--minDepth", "0",
                "--downsample_to_coverage",
                str(max_depth + 1000), "--minMappingQuality", "0",
                "--maxFractionOfReadsWithLowMAPQ", "1.1", "--maxDepth",
                str(max_depth), "--out", tx_out_file, "--summary", out_summary
            ]
            ready_region = shared.subset_variant_regions(
                variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region
                 and os.path.isfile(ready_region)) or not variant_regions
                    or not region):
                broad_runner.run_gatk(params,
                                      data=data,
                                      region=region,
                                      memory_retry=True)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"],
                                                   data["config"]):
                        if tregion.chrom == region:
                            out_handle.write(
                                "%s\t%s\t%s\tNO_COVERAGE\n" %
                                (tregion.chrom, tregion.start, tregion.stop))
    return [{
        "callable_bed": out_file,
        "config": data["config"],
        "work_bam": data["work_bam"]
    }]
Example #44
def main(config_file, in_file, space, start, end):
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)
    runner = broad.runner_from_config(config)
    target_region = (space, int(start), int(end))
    for pair in [1, 2]:
        out_file = "%s_%s-%s.fastq" % (os.path.splitext(
            os.path.basename(in_file))[0], pair, target_region[0])
        with open(out_file, "w") as out_handle:
            for name, seq, qual in bam_to_fastq_pair(in_file, target_region,
                                                     pair):
                out_handle.write("@%s/%s\n%s\n+\n%s\n" %
                                 (name, pair, seq, qual))
        sort_fastq(out_file, runner)
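
# bam_to_fastq_pair and sort_fastq are helpers defined elsewhere. As a rough
# illustration, a minimal pysam-based sketch of the extraction step might look
# like this (assumes an indexed BAM; the name and behavior are illustrative only):
import pysam

def bam_to_fastq_pair_sketch(in_file, target_region, pair):
    """Yield (name, seq, qual) for reads of the requested pair in a region."""
    space, start, end = target_region
    with pysam.AlignmentFile(in_file, "rb") as in_bam:
        for read in in_bam.fetch(space, start, end):
            if (pair == 1 and read.is_read1) or (pair == 2 and read.is_read2):
                qual = "".join(chr(q + 33) for q in read.query_qualities)
                yield read.query_name, read.query_sequence, qual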
Example #45
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    if "files" not in data:
        fastq1, fastq2 = None, None
    elif len(data["files"]) == 2:
        fastq1, fastq2 = data["files"]
    else:
        assert len(data["files"]) == 1, data["files"]
        fastq1, fastq2 = data["files"][0], None
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           data["sam_ref"], data["dirs"],
                                           config)
        elif sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = link_bam_file(
                fastq1,
                os.path.join(data["dirs"]["work"], "prealign",
                             data["rgnames"]["sample"]))
        bam.check_header(out_bam, data["rgnames"], data["sam_ref"],
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        data["work_bam"] = dedup_bam
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = ""
        data["work_bam"] = None
    else:
        raise ValueError("Could not process input file: %s" % fastq1)
    return [[data]]
Example #46
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    bam_files.sort()
    out_file = os.path.join(work_dir, os.path.basename(bam_files[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
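
# Hypothetical usage: collapse per-lane BAMs for one sample into a single
# working BAM (file names are illustrative):
merged_bam = merge_bam_files(["s1_L001.bam", "s1_L002.bam"], "work/align", config)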
Example #47
def combine_variant_files(orig_files,
                          out_file,
                          ref_file,
                          config,
                          quiet_out=True,
                          region=None):
    """Combine multiple VCF files into a single output file.

    Handles complex merging of samples and other tricky issues using GATK.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    broad_runner = broad.runner_from_config(config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = [
                "-T", "CombineVariants", "-R", ref_file, "--out", tx_out_file
            ]
            priority_order = []
            for i, orig_file in enumerate(orig_files):
                name = "v%s" % i
                params.extend(
                    ["--variant:{name}".format(name=name), orig_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(
                    ["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region,
                                                       out_file)
            if cur_region:
                params += [
                    "-L",
                    bamprep.region_to_gatk(cur_region), "--interval_set_rule",
                    "INTERSECTION"
                ]
            broad_runner.run_gatk(params)
    if in_pipeline:
        return [{
            file_key: out_file,
            "region": region,
            "sam_ref": ref_file,
            "config": config
        }]
    else:
        return out_file
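
# Hypothetical usage: merge SNP and indel calls for a region into a single
# VCF (file names and coordinates are illustrative):
combined = combine_variant_files(["sample-snps.vcf", "sample-indels.vcf"],
                                 "sample-combined.vcf", ref_file, config,
                                 region=("chr1", 0, 249250621))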
Example #48
def gatk_recalibrate(dup_align_bam, ref_file, config, snp_file=None):
    """Perform a GATK recalibration of the sorted aligned BAM,
    producing recalibrated BAM.
    """
    broad_runner = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    broad_runner.run_fn("picard_index_ref", ref_file)
    recal_file = _gatk_count_covariates(broad_runner, dup_align_bam, \
                                        ref_file, platform, snp_file)
    recal_bam = _gatk_table_recalibrate(broad_runner, dup_align_bam, ref_file, \
                                        recal_file, platform)
    broad_runner.run_fn("picard_index", recal_bam)

    return recal_bam
Example #49
def _needs_java(data):
    """Check if a caller needs external java for MuTect or older GATK 3.6.
    """
    vc = dd.get_variantcaller(data)
    if not isinstance(vc, (list, tuple)):
        vc = [vc]
    if "mutect" in vc:
        return True
    if "gatk" in vc or "gatk-haplotype" in vc:
        runner = broad.runner_from_config(data["config"])
        version = runner.get_gatk_version()
        if LooseVersion(version) < LooseVersion("3.6"):
            return True
    return False
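
# The version gate relies on distutils LooseVersion ordering, which compares
# dotted components numerically, e.g.:
from distutils.version import LooseVersion
assert LooseVersion("3.3-0") < LooseVersion("3.6")
assert LooseVersion("3.6") >= LooseVersion("3.6")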
Example #50
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type,
              data):
    """Run variant quality score recalibration.
    """
    cutoffs = [
        "100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93",
        "99.92", "99.91", "99.9", "99.8", "99.7", "99.6", "99.5", "99.0",
        "98.0", "90.0"
    ]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    base = utils.splitext_plus(in_file)[0]
    recal_file = "%s.recal" % base
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file,
                              plot_file) as (tx_recal, tx_tranches,
                                             tx_plot_file):
            params = [
                "-T", "VariantRecalibrator", "-R", ref_file, "--input",
                in_file, "--mode", filter_type, "--recal_file", tx_recal,
                "--tranches_file", tx_tranches, "--rscript_file", tx_plot_file
            ]
            params += _get_vqsr_training(filter_type, vrn_files)
            resources = config_utils.get_resources("gatk_variant_recalibrator",
                                                   data["config"])
            opts = resources.get("options", [])
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params,
                                      log_error=False,
                                      memscale=memscale)
            except Exception:  # Can fail to run if not enough values are present to train.
                return None, None
    return recal_file, tranches_file
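
# Hypothetical invocation for SNP recalibration; the training-file keys match
# those checked in _variant_filtration_snp below, and the file names are
# illustrative:
recal_file, tranches_file = _run_vqsr(
    "sample-effects.vcf.gz", ref_file,
    {"train_hapmap": "hapmap_3.3.vcf.gz", "train_1000g_omni": "1000G_omni.vcf.gz"},
    "99.97", "SNP", data)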
Example #51
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Normalizes qualities into 3 binned outputs at 10, 20 and 30, following pipeline
    standardization recommendations; binning helps reduce output file sizes:
    https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme
    https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/5585cdf7877104f2c61b2720ddfe7235f2fad577/PairedEndSingleSampleWf.gatk4.0.wdl#L1081

    Spark host and timeout settings help with runs on restricted systems
    where we otherwise encounter network and timeout errors.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                resources = config_utils.get_resources("gatk-spark", data["config"])
                spark_opts = [str(x) for x in resources.get("options", [])]
                params = ["-T", "ApplyBQSRSpark",
                          "--input", in_file, "--output", tx_out_file, "--bqsr-recal-file", data["prep_recal"],
                          "--static-quantized-quals", "10", "--static-quantized-quals", "20",
                          "--static-quantized-quals", "30"]
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800"]
                # Avoid problems with StreamClosedErrors on GATK 4.1+
                # https://github.com/bcbio/bcbio-nextgen/issues/2806#issuecomment-492504497
                params += ["--create-output-bam-index", "false"]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/bcbio/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
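
# The "gatk-spark" resources lookup above allows overriding the local[] Spark
# defaults. A hypothetical programmatic equivalent of the YAML resources
# section (normally set in the bcbio sample configuration; the master URL is
# illustrative):
data["config"]["resources"] = {
    "gatk-spark": {"options": ["--spark-master", "spark://scheduler:7077"]}
}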
Example #52
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {
        "magnitude": 0.9 * num_cores,
        "direction": "increase"
    } if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params,
                          os.path.dirname(tx_out_file),
                          memscale=memscale,
                          ld_preload=ld_preload)
Example #53
def _graphs_and_summary(bam_file, sam_ref, tmp_dir, config):
    """Prepare picard/FastQC graphs and summary details.
    """
    bait = config["algorithm"].get("hybrid_bait", None)
    target = config["algorithm"].get("hybrid_target", None)
    broad_runner = runner_from_config(config)
    metrics = PicardMetrics(broad_runner, tmp_dir)
    summary_table, metrics_graphs = \
                   metrics.report(bam_file, sam_ref, is_paired(bam_file), bait, target)
    metrics_graphs = [(p, c, 0.75) for p, c in metrics_graphs]
    fastqc_graphs, fastqc_stats, fastqc_overrep = \
                   fastqc_report(bam_file, config)
    all_graphs = fastqc_graphs + metrics_graphs
    summary_table = _update_summary_table(summary_table, sam_ref, fastqc_stats)
    return all_graphs, summary_table, fastqc_overrep
Example #54
def picard_prep(in_bam, names, ref_file, dirs, config):
    """Prepare input BAM using Picard and GATK cleaning tools.

    - ReorderSam to reorder file to reference
    - AddOrReplaceReadGroups to add read group information and coordinate sort
    - PrintReads to filter out problem records:
    - filterMBQ to remove reads with mismatching bases and base qualities
    """
    runner = broad.runner_from_config(config)
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", names["sample"]))
    reorder_bam = os.path.join(work_dir, "%s-reorder.bam" %
                               os.path.splitext(os.path.basename(in_bam))[0])
    reorder_bam = runner.run_fn("picard_reorder", in_bam, ref_file, reorder_bam)
    rg_bam = runner.run_fn("picard_fix_rgs", reorder_bam, names)
    return _filter_bad_reads(rg_bam, ref_file, runner, config)
Example #55
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf"
                               in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file,
                                           assoc_files, region,
                                           out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
Example #56
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"],
                                      data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(
        data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            broad_runner.run_fn("picard_index", data["work_bam"])
            params = [
                "-T", "CallableLoci", "-R", data["sam_ref"], "-I",
                data["work_bam"], "--out", tx_out_file, "--summary",
                out_summary
            ]
            ready_region = shared.subset_variant_regions(
                variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region
                 and os.path.isfile(ready_region)) or not variant_regions
                    or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"],
                                                   data["config"]):
                        if tregion.chrom == region:
                            out_handle.write(
                                "%s\t%s\t%s\tNO_COVERAGE\n" %
                                (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{
        "callable_bed": out_file,
        "config": data["config"],
        "work_bam": data["work_bam"]
    }]
Example #57
def _shared_gatk_call_prep(align_bams,
                           items,
                           ref_file,
                           region,
                           out_file,
                           num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += [
                "-L",
                bamprep.region_to_gatk(region), "--interval-set-rule",
                "INTERSECTION"
            ]
        else:
            params += [
                "-L",
                bamprep.region_to_gatk(region), "--interval_set_rule",
                "INTERSECTION"
            ]
    params += standard_cl_params(items)
    return broad_runner, params
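
# bamprep.region_to_gatk converts internal (contig, start, end) tuples into
# GATK interval strings. A minimal sketch of the assumed behavior, converting
# 0-based half-open coordinates to GATK's 1-based inclusive form:
def region_to_gatk_sketch(region):
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return "%s:%s-%s" % (chrom, start + 1, end)
    return region  # already a contig name or interval string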
Example #58
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = [
        "QD < 2.0", "MQ < 40.0", "FS > 60.0", "MQRankSum < -12.5",
        "ReadPosRankSum < -8.0"
    ]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data,
                                         filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(
            base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file,
                                  tranches_file) as (tx_recal, tx_tranches):
                params.extend(
                    ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train. Rerun with regional
                # filtration approach instead
                except Exception:
                    logger.info(
                        "VQSR failed due to lack of training data. Using hard filtering."
                    )
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file,
                                                   vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
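
# The hard-filtering fallback joins the individual JEXL expressions above into
# a single clause handed to vfilter.hard_w_expression, e.g.:
expression = " || ".join(["QD < 2.0", "MQ < 40.0", "FS > 60.0",
                          "MQRankSum < -12.5", "ReadPosRankSum < -8.0"])
# -> "QD < 2.0 || MQ < 40.0 || FS > 60.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"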
Example #59
def annotate_effects(orig_file, snpeff_file, genome_file, config):
    """Annotate predicted variant effects using snpEff.
    """
    broad_runner = broad.runner_from_config(config)
    out_file = "%s-annotated%s" % os.path.splitext(orig_file)
    # Avoid generalization since 2.0.3 is not working
    #snpeff_file = _general_snpeff_version(snpeff_file)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            params = [
                "-T", "VariantAnnotator", "-R", genome_file, "-A", "SnpEff",
                "--variant", orig_file, "--snpEffFile", snpeff_file, "--out",
                tx_out_file
            ]
            broad_runner.run_gatk(params)
    return out_file
Example #60
def _rnaseq_graphs_and_summary(bam_file, sam_ref, refflat_file, rrna_file,
                               qc_dir, tmp_dir, config):
    """Prepare picard/FastQC graphs and summary details.
    """
    broad_runner = runner_from_config(config)
    metrics = RNASeqPicardMetrics(broad_runner, tmp_dir)
    summary_table, metrics_graphs = metrics.report(bam_file, sam_ref,
                                                   refflat_file,
                                                   is_paired(bam_file),
                                                   rrna_file)
    metrics_graphs = [(p, c, 0.75) for p, c in metrics_graphs]
    fastqc_graphs, fastqc_stats, fastqc_overrep = \
                   fastqc_report(bam_file, qc_dir, config)
    all_graphs = fastqc_graphs + metrics_graphs
    summary_table = _update_summary_table(summary_table, sam_ref, fastqc_stats)
    return all_graphs, summary_table, fastqc_overrep