def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, producing split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    tobam_cmd = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -"
    # full BAM -- associate more memory and cores
    cores, mem = _get_cores_memory(data, downscale=2)
    # potentially downsample to maximum coverage here if not splitting and whole genome sample
    ds_cmd = None if data.get("align_split") else bam.get_maxcov_downsample_cl(data, "samtools")
    sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
    if ds_cmd:
        dedup_cmd = "%s %s > %s" % (tobam_cmd.format(out_file="", dext="full", **locals()),
                                    ds_cmd, tx_out_file)
    else:
        dedup_cmd = tobam_cmd.format(out_file="-o %s" % tx_out_file, dext="full", **locals())
    # split and discordant BAMs -- give less memory/cores since smaller files
    sort_opt = ""
    cores, mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = tobam_cmd.format(out_file="-o %s" % tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file="-o %s" % tx_disc_file, dext="disc", **locals())
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    cmd = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) "
           "--discordantFile >({discordant_cmd}) | {dedup_cmd}")
    return cmd.format(**locals())

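# The function above returns a single bash command string. Below is a minimal,
# self-contained sketch of the templating pattern it relies on, using
# hypothetical file names and resource values (nothing here comes from bcbio
# itself): the sort template is filled several times from local variables via
# str.format(**locals()), and the secondary outputs are attached with bash
# process substitution, so the result must run under bash rather than plain sh.
def _example_samblaster_cmd():
    samblaster, samtools = "samblaster", "samtools"
    cores, mem, sort_opt = 4, "2G", ""
    tmp_prefix = "sample1-sorttmp"
    tobam_cmd = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -"
    # unused keys in locals() are simply ignored by str.format
    splitter_cmd = tobam_cmd.format(out_file="-o sample1-sr.bam", dext="spl", **locals())
    dedup_cmd = tobam_cmd.format(out_file="-o sample1.bam", dext="full", **locals())
    return ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) "
            "| {dedup_cmd}").format(**locals())

# print(_example_samblaster_cmd()) shows the assembled bash pipeline
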
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query-sorted inputs,
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data["work_bam"], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data

def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq is only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: when a BAM file is given as input, bcbio skips multimapper removal. "
                        "If the BAM is not cleaned for peak calling, this can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data["config"])
    bam.index(data["work_bam"], data["config"])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]

def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks the system open file limit and merges in batches if necessary
    to avoid file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        sambamba = config_utils.get_program("sambamba", data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, data["config"])
        bam.index(out_file, data["config"])
    return out_file

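# config_utils.adjust_cores_to_mb_target above trades cores for per-core
# memory. The following is a hypothetical re-implementation of that idea, for
# illustration only (the real bcbio helper may behave differently): keep total
# memory fixed and shrink the core count until each core gets roughly the
# target number of megabytes.
def _cores_for_mem_target(target_mb, per_core_mem, num_cores):
    units = {"M": 1, "G": 1024}
    mem_mb = float(per_core_mem[:-1]) * units[per_core_mem[-1].upper()]
    total_mb = mem_mb * num_cores
    return max(1, min(num_cores, int(total_mb // target_mb)))

# e.g. 16 cores at 2G each (32G total) -> 9 cores of roughly 3.5G each:
# _cores_for_mem_target(3500, "2G", 16) == 9
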
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity
            # to other callers. Currently not used after doing more cross validation, as
            # they increase false positives, giving up the specificity that seems to be a
            # major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | "
                                "vcffixup - | vcfstreamsort | bgzip -c > %s"
                                % (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                                   vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file),
                                   tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file

def _biobambam_dedup_sort(data, tx_out_file):
    """Perform streaming deduplication and sorting with biobambam's bamsormadup.
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if data.get("align_split"):
        # split alignment: name sort (-n) when duplicates get marked during the
        # later merge, otherwise coordinate sort
        sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -" % sort_opt
    else:
        # no splitting: dedup and coordinate sort in one streaming pass,
        # downsampling to maximum coverage where configured
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        cmd = ("bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
               "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    return cmd.format(**locals())

def _biobambam_dedup_sort(data, tx_out_file):
    """Perform streaming deduplication and sorting with biobambam's bamsormadup.
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if data.get("align_split"):
        # split alignment: name sort (-n) when duplicates get marked during the
        # later merge, otherwise coordinate sort
        sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -" % sort_opt
    else:
        # no splitting: dedup and coordinate sort in one streaming pass,
        # downsampling to maximum coverage where configured
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        bamsormadup = config_utils.get_program("bamsormadup", data)
        cmd = ("{bamsormadup} inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
               "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    return cmd.format(**locals())

def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity
            # to other callers. Currently not used after doing more cross validation, as
            # they increase false positives, giving up the specificity that seems to be a
            # major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | "
                                "vcffixup - | vcfstreamsort | bgzip -c > %s"
                                % (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                                   vcfutils.add_contig_to_header_cl(items[0]), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file

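# Both run() variants above normalize "key=value" resource options into
# separate argv entries so that defaults can later be appended without
# clobbering user-specified flags. Below is a self-contained sketch of just
# that loop, with hypothetical option values:
def _normalize_options(opts):
    cmd = []
    for opt in opts:
        if "=" in opt:
            key, val = opt.split("=")
            cmd.extend([key, val])
        else:
            cmd.append(opt)
    return cmd

# _normalize_options(["--minReads=3", "--assemble"])
#   -> ["--minReads", "3", "--assemble"]
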
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks the system open file limit and merges in batches if necessary
    to avoid file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, data["config"])
        bam.index(out_file, data["config"])
    return out_file

def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if not dd.get_realign(data) and not dd.get_variantcaller(data):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data["config"]["algorithm"]["orig_markduplicates"] = dd.get_mark_duplicates(data)
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel, "piped_bamprep",
                                           _add_combine_info, file_key, ["config"])

def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))

def _skip_duplicates(data):
    return dd.get_coverage_interval(data) == "amplicon" or not dd.get_mark_duplicates(data)