def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Build the samblaster de-duplication and sorting shell pipeline.

    Nothing is executed here; the command line is only assembled and returned.
    samblaster's split read and discordant pair outputs are each routed
    through ``>(...)`` into their own ``samtools sort`` writing to
    ``tx_sr_file`` and ``tx_disc_file``. Its main output is piped into a
    further ``samtools sort`` that produces ``tx_out_file``.

    Args:
        data: sample dictionary; ``data["config"]`` is used for program lookup
            and ``data.get("align_split")`` selects the split-alignment path.
        tx_out_file: output path for the full sorted BAM.
        tx_sr_file: output path for the split read BAM.
        tx_disc_file: output path for the discordant pair BAM.

    Returns:
        The complete command line as a string.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]

    def _sort_cl(cores, mem, sort_opt, dext, out_file):
        """Render one `samtools sort` command with the given resources and output."""
        return "{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -".format(
            samtools=samtools, sort_opt=sort_opt, cores=cores, mem=mem,
            tmp_prefix=tmp_prefix, dext=dext, out_file=out_file)

    # full BAM -- associate more memory and cores
    full_cores, full_mem = _get_cores_memory(data, downscale=2)
    # Potentially downsample to maximum coverage here if not splitting and whole genome sample
    if data.get("align_split"):
        ds_cmd = None
    else:
        ds_cmd = bam.get_maxcov_downsample_cl(data, "samtools")
    # `-n` is only requested for split alignments that will have duplicates marked
    if data.get("align_split") and dd.get_mark_duplicates(data):
        full_sort_opt = "-n"
    else:
        full_sort_opt = ""
    if ds_cmd:
        # No `-o`: the sort output continues into the downsampling command,
        # which is redirected to the final file.
        full_sort = _sort_cl(full_cores, full_mem, full_sort_opt, "full", "")
        dedup_cmd = "%s %s > %s" % (full_sort, ds_cmd, tx_out_file)
    else:
        dedup_cmd = _sort_cl(full_cores, full_mem, full_sort_opt, "full", "-o %s" % tx_out_file)

    # split and discordant BAMs -- give less memory/cores since smaller files
    small_cores, small_mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = _sort_cl(small_cores, small_mem, "", "spl", "-o %s" % tx_sr_file)
    discordant_cmd = _sort_cl(small_cores, small_mem, "", "disc", "-o %s" % tx_disc_file)

    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    pipeline = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                "| {dedup_cmd}")
    return pipeline.format(samblaster=samblaster, splitter_cmd=splitter_cmd,
                           discordant_cmd=discordant_cmd, dedup_cmd=dedup_cmd)
def _biobambam_merge_maxcov(data):
    """Return a bammerge command template for combining query sorted BAM files.

    Sorts and truncates to maximum coverage, without de-duplication.

    The ``{tx_bam_file_list}`` and ``{tx_out_file}`` fields are left unfilled
    in the returned string -- presumably the caller completes them with
    ``str.format``; confirm against the call site.
    """
    downsample_cl = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    merge_template = "bammerge IL={tx_bam_file_list} tmpfile={tx_out_file}-merge %s > {tx_out_file}"
    return merge_template % downsample_cl
def _biobambam_merge_maxcov(data):
    """Build the bammerge command template used to combine query sorted BAMs.

    The merged output is sorted and truncated to maximum coverage; no
    de-duplication is performed.

    Only the downsampling arguments are interpolated here. The
    ``{tx_bam_file_list}`` and ``{tx_out_file}`` fields remain in the returned
    string (presumably filled in later by the caller -- verify at the call
    site).
    """
    template = ("bammerge IL={tx_bam_file_list} "
                "tmpfile={tx_out_file}-merge "
                "%s > {tx_out_file}")
    return template % bam.get_maxcov_downsample_cl(data, "bamsormadup")
def _biobambam_merge_dedup_maxcov(data):
    """Return a bamcat | bamsormadup command template for merging BAM files.

    Combines query sorted BAM files, de-duplicates, sorts and truncates to
    maximum coverage. Handles split files, checking for large scale whole
    genome coverage where we want to downsample to a maximum coverage.

    The ``{tx_out_file}``, ``{tx_bam_file_list}`` and ``{num_cores}`` fields
    are left unfilled in the returned string -- presumably the caller supplies
    them through ``str.format``; confirm against the call site.
    """
    downsample_cl = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    # Stage 1: concatenate the inputs named in the file list.
    concat_stage = "bamcat level=0 tmpfile={tx_out_file}-bammerge `cat {tx_bam_file_list}`"
    # Stage 2: sort and mark duplicates, followed by the downsampling arguments.
    dedup_stage = ("bamsormadup threads={num_cores} "
                   "tmpfile={tx_out_file}-bamsormaduptmp %s > {tx_out_file}")
    return (concat_stage + " | " + dedup_stage) % downsample_cl
def _biobambam_merge_dedup_maxcov(data):
    """Build the command template that merges and de-duplicates query sorted BAMs.

    The pipeline concatenates the inputs with bamcat and streams them into
    bamsormadup for de-duplication and sorting, truncating to maximum
    coverage. This handles split files, checking for large scale whole genome
    coverage where we want to downsample to a maximum coverage.

    Only the downsampling arguments are interpolated here. The
    ``{tx_out_file}``, ``{tx_bam_file_list}`` and ``{num_cores}`` fields stay
    in the returned string (presumably filled in later by the caller -- verify
    at the call site).
    """
    template = (
        "bamcat level=0 tmpfile={tx_out_file}-bammerge `cat {tx_bam_file_list}` | "
        "bamsormadup threads={num_cores} "
        "tmpfile={tx_out_file}-bamsormaduptmp %s > {tx_out_file}"
    )
    maxcov_args = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    return template % maxcov_args
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command for an alignment.

    For split alignments (``data.get("align_split")`` is truthy) the result is
    a ``samtools sort -n`` command writing BAM to ``tx_out_file``. Otherwise it
    is a biobambam ``bamsormadup`` command that de-duplicates, coordinate sorts
    and appends the maximum coverage downsampling arguments.

    Args:
        data: sample dictionary; ``data["config"]`` is used for program lookup.
        tx_out_file: output BAM path; also the base for temporary file prefixes.

    Returns:
        The command line as a string (nothing is executed here).
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]

    if data.get("align_split"):
        namesort_cl = "{samtools} sort -n -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -"
        return namesort_cl.format(samtools=samtools, cores=cores, mem=mem,
                                  tmp_file=tmp_file, tx_out_file=tx_out_file)

    ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    cmd = ("bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
           "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    # ds_cmd is spliced in before formatting, so any {fields} it carries are
    # resolved against the names in scope here; keep formatting from locals().
    return cmd.format(**locals())
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command for an alignment.

    For split alignments (``data.get("align_split")`` is truthy) the result is
    a ``samtools sort`` command writing BAM to ``tx_out_file``; ``-n`` is added
    only when ``dd.get_mark_duplicates(data)`` is truthy. Otherwise it is a
    biobambam ``bamsormadup`` command that de-duplicates, coordinate sorts and
    appends the maximum coverage downsampling arguments.

    Args:
        data: sample dictionary; ``data["config"]`` is used for program lookup.
        tx_out_file: output BAM path; also the base for temporary file prefixes.

    Returns:
        The command line as a string (nothing is executed here).
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]

    if data.get("align_split"):
        # align_split is already established by this branch, so only the
        # duplicate marking setting decides whether to pass -n.
        sort_opt = "-n" if dd.get_mark_duplicates(data) else ""
        split_cl = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -"
        return split_cl.format(samtools=samtools, sort_opt=sort_opt, cores=cores, mem=mem,
                               tmp_file=tmp_file, tx_out_file=tx_out_file)

    ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    cmd = ("bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
           "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    # ds_cmd is spliced in before formatting, so any {fields} it carries are
    # resolved against the names in scope here; keep formatting from locals().
    return cmd.format(**locals())
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command for an alignment.

    For split alignments (``data.get("align_split")`` is truthy) the result is
    a ``samtools sort`` command writing BAM to ``tx_out_file``; ``-n`` is added
    only when ``_check_dedup(data)`` is truthy. Otherwise it is a biobambam
    ``bamsormadup`` command that de-duplicates, coordinate sorts and appends
    the maximum coverage downsampling arguments, run on 75% of the allotted
    cores (rounded up, at least one).

    Args:
        data: sample dictionary; ``data["config"]`` is used for program lookup.
        tx_out_file: output BAM path; also the base for temporary file prefixes.

    Returns:
        The command line as a string (nothing is executed here).
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]

    if data.get("align_split"):
        # align_split is already established by this branch, so only the
        # de-duplication check decides whether to pass -n.
        sort_opt = "-n" if _check_dedup(data) else ""
        split_cl = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -"
        return split_cl.format(samtools=samtools, sort_opt=sort_opt, cores=cores, mem=mem,
                               tmp_file=tmp_file, tx_out_file=tx_out_file)

    # scale core usage to avoid memory issues with larger WGS samples
    cores = max(1, int(math.ceil(cores * 0.75)))
    ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    # NOTE(review): this lookup passes `data` where the samtools lookup passes
    # data["config"] -- confirm get_program accepts both forms.
    bamsormadup = config_utils.get_program("bamsormadup", data)
    cmd = ("{bamsormadup} inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
           "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    # ds_cmd is spliced in before formatting, so any {fields} it carries are
    # resolved against the names in scope here; keep formatting from locals().
    return cmd.format(**locals())