def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files:
            assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        do_phasing = data["config"]["algorithm"].get("phasing", False)
        call_file = "%s-raw%s" % utils.splitext_plus(out_file) if do_phasing else out_file
        call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file)
        if do_phasing == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                    region, config)
        utils.symlink_plus(call_file, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]

def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Perform realignment with GATK, using input commandline.

    GATK requires writing to disk and indexing before realignment.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)
    if not utils.file_exists(pa_bam):
        with file_transaction(data, pa_bam) as tx_out_file:
            cmd = "{cl} -o {tx_out_file}".format(**locals())
            do.run(cmd, "GATK re-alignment {0}".format(region), data)
    bam.index(pa_bam, data["config"])
    recal_file = realign.gatk_realigner_targets(
        broad_runner, pa_bam, data["sam_ref"], data["config"],
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
    recal_cl = realign.gatk_indel_realignment_cl(
        broad_runner, pa_bam, data["sam_ref"], recal_file, tmp_dir,
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
    return pa_bam, recal_cl

def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        p_out_file = out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(items[0], p_out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=%s" % tx_out_file,
                   "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                    "--filteredReadsFrac", "0.9"]
            # Avoid filtering duplicates on high depth targeted regions where
            # we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            do.run(cmd, "platypus variant calling")
        if p_out_file != out_file:
            post_process_cmd = "%s | vcfallelicprimitives | vcfstreamsort" % vcfutils.fix_ambiguous_cl()
            b_out_file = vcfutils.bgzip_and_index(p_out_file, items[0]["config"],
                                                  prep_cmd=post_process_cmd)
            assert b_out_file == out_file
    return out_file

def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data

def _prep_inputs(align_bams, ref_file, items):
    """Ensure inputs to calling are indexed as expected.
    """
    broad_runner = broad.runner_from_path("picard", items[0]["config"])
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, items[0]["config"])

def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        ref_file = dd.get_ref_file(data)
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files:
            assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        out_file = caller_fn(align_bams, items, ref_file, assoc_files, region, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]

def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file

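# The vcfsamplediff call above tags each record with a VT INFO field
# (germline / somatic / loh) by comparing the normal and tumor genotypes.
# A hedged, illustrative downstream filter (not a bcbio call site):
#
#   bcftools view -i 'INFO/VT="somatic"' sample-paired-variants.vcf.gz
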
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config["algorithm"],
                                                           out_file, region))
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file

def correct_umis(data):
    """Correct umis against the whitelist in correct_umi_file

    http://fulcrumgenomics.github.io/fgbio/tools/latest/CorrectUmis.html
    """
    input_bam = dd.get_work_bam(data)
    output_bam = os.path.join(
        utils.safe_makedir(os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
        "%s-umis_corrected%s" % utils.splitext_plus(os.path.basename(input_bam)))
    jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(output_bam), 2)
    # Improve speeds by avoiding compression read/write bottlenecks
    io_opts = "--async-io=true --compression=0"
    umis_whitelist = tz.get_in(["config", "algorithm", "correct_umis"], data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    if not utils.file_exists(output_bam):
        umi_method, umi_tag = _check_umi_type(input_bam)
        cmd = ("unset JAVA_HOME && "
               "{fgbio} {jvm_opts} {io_opts} CorrectUmis "
               "-t {umi_tag} -m 3 -d 1 -x "
               "-U {umis_whitelist} "
               "-i {input_bam} -o /dev/stdout | {samtools} view -bh > {output_bam}")
        do.run(cmd.format(**locals()), "Correcting UMIs")
    bam.index(output_bam, data["config"])
    return output_bam

def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.
    """
    raise NotImplementedError("Cortex currently out of date and needs reworking.")
    # Remaining implementation kept for reference; unreachable until reworked.
    if len(align_bams) == 1:
        align_bam = align_bams[0]
        config = items[0]["config"]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            vcfutils.write_empty_vcf(out_file)
    return out_file

def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    bam.index(align_bam, config)
    runner.run_fn("picard_index_ref", ref_file)
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        bam.index(align_bam, config)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam, ref_file,
                                                     dbsnp, region, out_file,
                                                     deep_coverage, variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # No longer required in recent GATK (> Feb 2011) -- now done on the fly
        # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
        return realign_bam
    elif out_file:
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam

def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file, data):
    """Retrieve summary of coverage in a region.

    Requires positive non-zero mapping quality at a position, matching
    GATK's CallableLoci defaults.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            bam.index(in_bam, data["config"])
            fai_file = ref.fasta_idx(ref_file, data["config"])
            sambamba = config_utils.get_program("sambamba", data["config"])
            bedtools = config_utils.get_program("bedtools", data["config"])
            max_depth = depth["max"] + 1
            cmd = ("{sambamba} view -F 'mapping_quality > 0' -L {region_file} "
                   "-f bam -l 1 {in_bam} | "
                   "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                   "-max {max_depth} > {tx_out_file}")
            do.run(cmd.format(**locals()), "bedtools genomecov: %s" % (str(region)), data)
    # Empty output file, no coverage for the whole contig
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, 0))
    return out_file

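# For reference, the `bedtools genomecov -bga` step above emits BED4 lines
# whose fourth column is the read depth, including zero-coverage stretches.
# Illustrative output (made-up coordinates):
#
#   chr1    0       1523    0
#   chr1    1523    1890    12
#   chr1    1890    1944    47
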
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                cmd = [config_utils.get_program("gatk-framework", data["config"])] + jvm_opts + params
                do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file

def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Perform realignment with GATK, using input commandline.

    GATK requires writing to disk and indexing before realignment.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)
    if not utils.file_exists(pa_bam):
        with file_transaction(data, pa_bam) as tx_out_file:
            cmd = "{cl} -o {tx_out_file}".format(**locals())
            do.run(cmd, "GATK pre-alignment {0}".format(region), data)
    bam.index(pa_bam, data["config"])
    recal_file = realign.gatk_realigner_targets(
        broad_runner, pa_bam, data["sam_ref"], data["config"],
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
    recal_cl = realign.gatk_indel_realignment_cl(
        broad_runner, pa_bam, data["sam_ref"], recal_file, tmp_dir,
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
    return pa_bam, " ".join(recal_cl)

def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = ('-F "not unmapped {paired_filter} and not duplicate '
                   'and [XA] == null and [SA] == null and not supplementary " ')
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params

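# A minimal sketch of how a caller consumes the (runner, params) pair from
# _shared_gatk_call_prep above. The walker name, output flag and extra options
# are illustrative assumptions, not an exact bcbio call site:
def _example_unified_genotyper(align_bams, items, ref_file, dbsnp, region, out_file):
    broad_runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                                  dbsnp, region, out_file)
    # Add the caller-specific walker and output on top of the shared params
    params += ["-T", "UnifiedGenotyper",
               "-o", out_file,
               "--genotype_likelihoods_model", "BOTH"]
    broad_runner.run_gatk(params)
    return out_file
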
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome
    BAMs based on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd + \
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' + \
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data

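# split_ATAC assumes a module-level ATACRanges mapping of fragment-size
# classes. A minimal sketch of its shape; the exact insert-size boundaries
# here are illustrative assumptions rather than canonical values:
from collections import namedtuple

ATACRange = namedtuple('ATACRange', ['label', 'min', 'max'])
ATACRanges = {"NF": ATACRange('NF', -2147483647, 100),
              "MN": ATACRange('MN', 150, 300),
              "DN": ATACRange('DN', 300, 500),
              "TN": ATACRange('TN', 500, 700)}
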
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    aligner_indexes = os.path.commonprefix(tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if not data.get("align_split"):
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data

def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)
    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Limit per base read start count to between 200-10000, i.e. from any base
    # no more than 10000 new reads can begin.
    # Further, limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    max_depth = min(max(200, get_in(paired.tumor_config,
                                    ("algorithm", "coverage_depth_max"), 10000)), 10000)
    params += ["--downsample_to_coverage", max_depth]
    params += ["--maxNumberOfReads", str(int(max_depth) * window_size)]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(get_in(paired.tumor_config, ("algorithm",
                                                "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # notice there must be at least 4 reads of coverage in normal
        params += ["--filter_expressions", "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af]
    else:
        params += ["--unpaired"]
        params += ["--filter_expressions", "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params

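# Worked example of the read-cap arithmetic above: with the capped default
# max_depth of 10000 and SID's 200bp window, --maxNumberOfReads becomes
# 10000 * 200 = 2,000,000 reads per window.
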
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    bam.index(align_bam, config)
    runner.run_fn("picard_index_ref", ref_file)
    ref.fasta_idx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        bam.index(align_bam, config)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam, ref_file,
                                                     dbsnp, region, out_file,
                                                     deep_coverage, variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # No longer required in recent GATK (> Feb 2011) -- now done on the fly
        # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
        return realign_bam
    elif out_file:
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam

def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params

def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for x in align_bams:
            bam.index(x, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if ((variant_regions is not None and isinstance(target_regions, basestring)
             and not os.path.isfile(target_regions))
                or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file

def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file

def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1 --experimental-gls"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | {vcfallelicprimitives} | "
                   "{vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file

def dedup_bam(in_bam, data):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = os.path.join(
            utils.safe_makedir(os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
            "%s-dedup%s" % utils.splitext_plus(os.path.basename(in_bam)))
        if not utils.file_exists(out_file):
            with tx_tmpdir(data) as tmpdir:
                with file_transaction(data, out_file) as tx_out_file:
                    bammarkduplicates = config_utils.get_program("bammarkduplicates", data["config"])
                    base_tmp = os.path.join(tmpdir, os.path.splitext(os.path.basename(tx_out_file))[0])
                    cores, mem = _get_cores_memory(data, downscale=2)
                    cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                           "markthreads={cores} I={in_bam} O={tx_out_file}")
                    do.run(cmd.format(**locals()), "De-duplication with biobambam")
        bam.index(out_file, data["config"])
        return out_file
    else:
        return in_bam

def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: when a BAM file is given as input, bcbio skips "
                        "multimapper removal. If the BAM is not cleaned before peak "
                        "calling, this can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]

def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query-sorted inputs,
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data

def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark",
                          "--sparkMaster", "local[%s]" % dd.get_num_cores(data),
                          "--input", in_file,
                          "--output", tx_out_file,
                          "--bqsr_recal_file", data["prep_recal"]]
            else:
                params = ["-T", "PrintReads",
                          "-R", dd.get_ref_file(data),
                          "-I", in_file,
                          "-BQSR", data["prep_recal"],
                          "-o", tx_out_file]
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
    bam.index(out_file, data["config"])
    return out_file

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "highdepth": covinfo.highdepth,
                           "sample_callable": covinfo.callable,
                           "coverage_depth_bed": covinfo.depth,
                           "avg_coverage": covinfo.avg_coverage}
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    if vmulti.bam_needs_processing(data) and data["work_bam"].endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(data["work_bam"], ref_file, data)
        highdepth_bed = highdepth.identify(data)
        bam.index(data["work_bam"], data["config"])
        sample_callable = callable.sample_callable_bed(data["work_bam"], ref_file, data)
        offtarget_stats = callable.calculate_offtarget(data["work_bam"], ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "highdepth": highdepth_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]

def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        call_file = "%s-raw%s" % utils.splitext_plus(out_file)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files:
            assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file)
        if data["config"]["algorithm"].get("phasing", False) == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                    region, config)
        utils.symlink_plus(call_file, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]

def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = ('-F "not unmapped {paired_filter} and [XA] == null '
                   'and [SA] == null and not supplementary " ')
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]

def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)
    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE"]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params

def filter_multimappers(align_file, data):
    """
    Bowtie2 does not seem to have an equivalent of bowtie's -m 1 flag;
    there are some options that are close but don't do the same thing.
    Bowtie2 sets the XS flag for reads mapping in more than one place,
    so we can just filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

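# For a paired-end BAM, the templated sambamba command above renders to
# something like the following (paths and core count are illustrative):
#
#   sambamba view -h --nthreads 8 -f bam \
#       -F "[XS] == null and not unmapped and paired and proper_pair and not duplicate" \
#       sample.bam > sample.unique.bam
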
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--downsample_to_coverage",
               max(200, get_in(paired.tumor_config,
                               ("algorithm", "coverage_depth_max"), 10000))]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params

def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if data["config"]["algorithm"].get("recalibrate", True) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(data["name"]))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam,
                                                     ref_file, platform, dbsnp_file,
                                                     intervals, data)
    return [[data]]

def _piped_input_cl(data, region, tmp_dir, out_base_file, prep_params):
    """Retrieve the commandline for streaming input into preparation step.

    If marking duplicates, this requires writing an intermediate file since
    MarkDuplicates uses multiple passes on an input.
    """
    broad_runner = broad.runner_from_config(data["config"])
    cl = _gatk_extract_reads_cl(data, region, prep_params, tmp_dir)
    if prep_params["dup"] == "picard":
        sel_file = "%s-select%s" % os.path.splitext(out_base_file)
        if not utils.file_exists(sel_file):
            with file_transaction(sel_file) as tx_out_file:
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK: PrintReads {0}".format(region), data)
        dup_metrics = "%s-dup.dup_metrics" % os.path.splitext(out_base_file)[0]
        compression = "5" if prep_params["realign"] == "gatk" else "0"
        cl = broad_runner.cl_picard("MarkDuplicates",
                                    [("INPUT", sel_file),
                                     ("OUTPUT", "/dev/stdout"),
                                     ("METRICS_FILE", dup_metrics),
                                     ("PROGRAM_RECORD_ID", "null"),
                                     ("COMPRESSION_LEVEL", compression),
                                     ("TMP_DIR", tmp_dir)])
    elif not prep_params["dup"]:
        sel_file = data["work_bam"]
    else:
        raise ValueError("Duplication approach not supported with GATK: %s" % prep_params["dup"])
    bam.index(sel_file, data["config"])
    return sel_file, " ".join(cl)

def _piped_input_cl(data, region, tmp_dir, out_base_file, prep_params):
    """Retrieve the commandline for streaming input into preparation step.
    """
    cl = _gatk_extract_reads_cl(data, region, prep_params, tmp_dir)
    sel_file = data["work_bam"]
    bam.index(sel_file, data["config"])
    return sel_file, " ".join(cl)

def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.
    """
    if len(align_bams) == 1:
        align_bam = align_bams[0]
        config = items[0]["config"]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            with open(target_regions) as in_handle:
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            vcfutils.write_empty_vcf(out_file)
    return out_file

def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params

def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults
    and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently does not downsample as new versions do not get good sensitivity
    with downsampled BAMs.
    """
    full_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    return [full_bam]
    # Downsampling implementation kept for reference; unreachable while disabled above.
    ds_bam = bam.downsample(full_bam, data, 1e8,
                            read_filter="-F 'not secondary_alignment and proper_pair'",
                            always_run=True, work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]

def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            # Adjust default filter thresholds to achieve similar
            # sensitivity/specificity to other callers
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                    "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
                    "--qdThreshold", "0", "--abThreshold", "0.0001",
                    "--minVarFreq", "0.0"]
            # Avoid filtering duplicates on high depth targeted regions where
            # we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file

def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("samtools", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O sam "
                               "-T {out_base} {in_bam} | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file

def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file

def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam,
                                                     ref_file, platform, dbsnp_file,
                                                     intervals, data)
    return [[data]]

def piped_bamprep(data, region=None, out_file=None):
    """Perform full BAM preparation using pipes to avoid intermediate disk IO.

    Handles recalibration and realignment of original BAMs.
    """
    data["region"] = region
    if not _need_prep(data):
        return [data]
    else:
        utils.safe_makedir(os.path.dirname(out_file))
        if region[0] == "nochrom":
            prep_bam = shared.write_nochr_reads(data["work_bam"], out_file, data["config"])
        elif region[0] == "noanalysis":
            prep_bam = shared.write_noanalysis_reads(data["work_bam"], region[1], out_file,
                                                     data["config"])
        else:
            if not utils.file_exists(out_file):
                with tx_tmpdir(data) as tmp_dir:
                    _piped_bamprep_region(data, region, out_file, tmp_dir)
            prep_bam = out_file
        bam.index(prep_bam, data["config"])
        data["work_bam"] = prep_bam
        return [data]

def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for x in align_bams:
            bam.index(x, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
                and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file

def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults
    and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently not used as new versions of delly do not get good sensitivity
    with downsampled BAMs.
    """
    sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    ds_bam = bam.downsample(dd.get_align_bam(data), data, 1e8,
                            read_filter="-F 'not secondary_alignment and proper_pair'",
                            always_run=True, work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]

def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params

def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        bam.index(out_file, dd.get_config(data))
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    bam.index(out_file, dd.get_config(data))
    return out_file
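
# A minimal usage sketch; the file paths and the `data` sample dictionary are
# illustrative assumptions about the surrounding pipeline:
#
#   bam_file = rapmap_align("sample_R1.fq.gz", "sample_R2.fq.gz", "work/rapmap",
#                           "ref/transcripts.gtf", "ref/genome.fa", "quasi", data)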