def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Builds and executes a ``qualimap bamqc`` command for the sample's BAM,
    optionally downsampling first, then copies the base results file into
    ``out_dir`` and returns paths for downstream upload/QC parsing.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    # NOTE(review): if "options" is absent this joins the empty string; presumably
    # the resource value is a list of flag strings when present -- confirm upstream.
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    # Re-run only when neither the text results nor a previous PDF report exist.
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # Downsample to ~1e7 reads to keep runtime reasonable; falls back to
            # the full BAM when downsampling returns nothing.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            # Point Java's tmpdir at the transactional directory to avoid filling /tmp.
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            # Prefer the configured coverage regions; otherwise fall back to
            # merged variant calling regions.
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            # cmd interpolates local variable names via locals(); renames above would break it.
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            # Rewrite the recorded BAM name so MultiQC picks up the sample name.
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    bwa-mem output is streamed through samtools view/sort so no intermediate
    SAM file touches disk.
    """
    # Empty string keeps single-end runs working in the shared command template.
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        # Ensure samtools supports the piped sort invocation below -- TODO confirm
        # exact behavior of this project helper.
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                # Interpolates from local variable names; renames above would break it.
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            # tobam_cl supplies the downstream BAM-conversion/sort half of the pipeline.
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                # -l 0: no compression on intermediate stream; -p: interleaved pairs for bwa.
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                # Interpolates from local variable names; renames above would break it.
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from input BAM file.

    Streams a queryname sort of the input BAM into samblaster, which produces
    de-duplicated, split-read and discordant outputs in a single pass.

    Returns a tuple of (dedup_file, sr_file, disc_file) paths in work_dir; all
    three outputs are indexed after generation.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("samtools", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    # Regenerate only when any of the three outputs is missing. Bug fix: the
    # dedup_file check previously lacked "not", so the work was redone exactly
    # when the dedup output already existed and skipped when it was missing.
    if (not utils.file_exists(sr_file) or not utils.file_exists(disc_file)
            or not utils.file_exists(dedup_file)):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        # samblaster_dedup_sort supplies the downstream half of the pipe.
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O sam -T {out_base} {in_bam} | ")
                        # Interpolates from local variable names defined above.
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
        for fname in [sr_file, disc_file, dedup_file]:
            bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        # Ensure samtools supports the piped invocation -- TODO confirm helper semantics.
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                # -l 0: uncompressed intermediate; -p: interleaved pairs for bwa mem.
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                # Interpolates from local variable names; renames above would break it.
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    Returns the merged (and indexed) BAM path.
    """
    # Resolve the output name up front. Bug fix: previously this defaulting
    # only happened on the multi-file path, so a single pre-sorted BAM with
    # out_file=None crashed in shutil.copy(..., None), and the batch suffix was
    # skipped for single-file batches -- colliding with the parent merge output.
    if out_file is None:
        out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
    if batch is not None:
        base, ext = os.path.splitext(out_file)
        out_file = "%s-b%s%s" % (base, batch, ext)
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
        # Single coordinate-sorted input: a straight copy is sufficient.
        shutil.copy(bam_files[0], out_file)
    else:
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) \
                                as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                # on-the-fly merge+dedup requires duplicate marking enabled
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            # cmd interpolates from local variable names above.
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Skips work when an existing HTML or PDF report is present; returns parsed
    metrics from the HTML report.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    # NOTE(review): joins "" when "options" absent; presumably a list of flags
    # when present -- confirm against resource configuration.
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # Downsample to ~1e7 reads for speed; keep original if downsampling yields nothing.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        # Interpolates from local variable names; renames above would break it.
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            # tobam_cl supplies the downstream conversion/sort half of the pipeline.
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                # "-" tells the bwa helper to read interleaved FASTQ from stdin.
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                # Interpolates from local variable names; renames above would break it.
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def sort(in_bam, config, order="coordinate", out_dir=None):
    """Sort a BAM file, skipping if already present.

    order may be "coordinate" (default) or "queryname"; an optional out_dir
    redirects the sorted output location via _get_sort_stem.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam
    sort_stem = _get_sort_stem(in_bam, order, out_dir)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            # Called for its directory-creation side effect.
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            # Slightly decrease memory and allow more accurate representation
            # in Mb to ensure fits within systems like SLURM
            mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                             1.25, "decrease", out_modifier="M").upper()
            cmd = ("{samtools} sort -@ {cores} -m {mem} -O BAM {order_flag} "
                   "-T {tx_sort_stem}-sort -o {tx_sort_file} {in_bam}")
            # Interpolates from local variable names; renames above would break it.
            do.run(cmd.format(**locals()), "Sort BAM file %s: %s to %s" %
                   (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Currently returns an empty dict; metric parsing is disabled (see the
    commented-out _parse_qualimap_metrics call at the end).
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    # NOTE(review): joins "" when "options" absent; presumably a list of flag
    # strings when present -- confirm against resource configuration.
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # Downsample to ~1e7 reads for speed; keep original if nothing returned.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)
        export = utils.local_path_export()
        cmd = (
            "unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
            "--skip-duplicated --skip-dup-mode 0 "
            "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(
                tz.get_in("genome_build", data, "").startswith(k) for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        # NOTE(review): merge_overlaps is called even when coverage is None --
        # relies on the helper tolerating a None input; verify.
        regions = bedutils.merge_overlaps(
            dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        # Interpolates from local variable names; renames above would break it.
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    # return _parse_qualimap_metrics(report_file, data)
    return dict()
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """Create the qualimap rnaseq command line for a BAM file.

    Returns the fully formatted shell command string; does not execute it.
    Strand-specific libraries are forced to unstranded mode (see note below).
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    # Bug fix: a previous plain "%s%s" export assignment was immediately
    # overwritten by this one, so the dead statement has been removed.
    # Point Java's tmpdir at out_dir to avoid filling the system /tmp.
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    if library != "non-strand-specific":
        logger.info(
            "Qualimap can get the orientation wrong for stranded reads, so we run it in unstranded mode. This gives comparable results to unstranded for RNA-seq data (see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ) for a further explanation."
        )
        library = "non-strand-specific"
    paired = " --paired" if bam.is_paired(bam_file) else ""
    # Interpolates from local variable names above; renames would break it.
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Downsamples large inputs, runs bamqc, then parses the HTML report.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        # Downsample to ~1e7 reads for speed; keep original if nothing returned.
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        # Interpolates from local variable names; renames above would break it.
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from input BAM file.

    Name-sorts the input BAM and streams it through samblaster to produce
    de-duplicated, split-read and discordant outputs in one pass.

    Returns a tuple of (dedup_file, sr_file, disc_file) paths in work_dir.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    # Regenerate only when any of the three outputs is missing. Bug fix: the
    # dedup_file check previously lacked "not", inverting the skip logic.
    if (not utils.file_exists(sr_file) or not utils.file_exists(disc_file)
            or not utils.file_exists(dedup_file)):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        # samblaster_dedup_sort supplies the downstream half of the pipe.
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        # Bug fix: use basename -- joining tmpdir with an absolute
                        # in_bam path would discard tmpdir entirely.
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        # Interpolates from local variable names defined above.
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
        for fname in [sr_file, disc_file, dedup_file]:
            bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Skips work when an existing HTML or PDF report is present; returns parsed
    metrics from the HTML report.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    # NOTE(review): joins "" when "options" absent; presumably a list of flag
    # strings when present -- confirm against resource configuration.
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # Downsample to ~1e7 reads for speed; keep original if nothing returned.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        # Interpolates from local variable names; renames above would break it.
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.

    Uses samblaster with bash process substitution so split and discordant
    reads are converted and sorted by sambamba in parallel side pipelines.
    Returns (sr_file, disc_file) paths in work_dir.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    # Shared template for converting samblaster SAM output to sorted BAM.
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    # Main read stream is discarded (-o /dev/null); only the side
                    # outputs via process substitution are kept. Requires bash.
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.

    Prefers sambamba when available, falling back to samtools; a failed
    multi-core sambamba/samtools run is retried with single-core samtools.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam
    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            # Called for its directory-creation side effect.
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(
                    cmd.format(**locals()),
                    "Sort BAM file (multi core, %s): %s to %s" %
                    (order, os.path.basename(in_bam), os.path.basename(sort_file)))
            # Bug fix: was a bare "except:", which also swallowed
            # SystemExit/KeyboardInterrupt; narrow to Exception.
            except Exception:
                logger.exception(
                    "Multi-core sorting failed, reverting to single core")
                # Reset samtools-oriented settings for the single-core retry.
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(
                    samtools_cmd.format(**locals()),
                    "Sort BAM file (single core, %s): %s to %s" %
                    (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
def _get_cores_memory(data, downscale=2):
    """Retrieve cores and memory, using samtools as baseline.

    For memory, scaling down because we share with alignment and de-duplication.
    """
    config = data["config"]
    samtools_resources = config_utils.get_resources("samtools", config)
    base_mem = samtools_resources.get("memory", "2G")
    cores = config["algorithm"].get("num_cores", 1)
    scaled_mem = config_utils.adjust_memory(base_mem, downscale, "decrease")
    return cores, scaled_mem.upper()
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits. Each merged output is validated with samtools
    quickcheck before and after the transactional transfer.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], config, "coordinate"):
            # Single coordinate-sorted input: plain copy inside a transaction.
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Recursively merge in sub-batches, then merge those results.
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources(
                            "samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # on-the-fly merge+dedup requires duplicate marking enabled
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        # cmd interpolates from local variable names above.
                        do.run(
                            cmd.format(**locals()),
                            "Merge bam files to %s" % os.path.basename(out_file),
                            None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
        bam.index(out_file, config)
    return out_file
def sort_cmd(config, tmp_dir, named_pipe=None, order="coordinate"):
    """
    Get a sort command, suitable for piping

    Reads from named_pipe when given, otherwise from stdin, and writes sorted
    BAM to stdout via sambamba.
    """
    sambamba = _get_sambamba(config)
    pipe = named_pipe if named_pipe else "/dev/stdin"
    order_flag = "-n" if order == "queryname" else ""
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 1, "decrease").upper()
    # Interpolates from local variable names; renames above would break it.
    cmd = ("{sambamba} sort -m {mem} --tmpdir {tmp_dir} -t {num_cores} {order_flag} -o /dev/stdout {pipe}")
    return cmd.format(**locals())
def _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file=None, single_end=None):
    """
    Create command lines for qualimap

    Returns the formatted "qualimap rnaseq" shell command string; does not
    execute it. single_end is currently unused.
    """
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", 1)
    max_mem = config_utils.adjust_memory(resources.get("memory", "4G"),
                                         num_cores)
    # Interpolates from local variable names; renames above would break it.
    cmd = ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} -a proportional -bam {bam_file} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits. Uses bamtools for the merge piped into samtools sort.
    """
    # A single input needs no merging; return it unchanged.
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            # Suffix sub-batch outputs so recursive merges do not collide.
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # Stay under the OS open-file limit when merging many inputs.
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]
                                              ) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            # cmd interpolates from local variable names above.
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" % os.path.basename(out_file),
                                None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Runs ``bwa mem | samtools view | samtools sort`` as a single pipe when
    possible, falling back to the older ``bwa aln`` path via ``align`` when
    piping is not supported for the input.

    :param fastq_file: first-end fastq (may be replaced by a named-pipe command)
    :param pair_file: second-end fastq or falsy for single-end
    :param ref_file: bwa-indexed reference
    :param names: read group naming info (uses "lane" and "sample" keys)
    :param align_dir: output directory for the sorted BAM
    :param data: sample data dictionary
    :return: updated ``data`` with "work_bam" set to the sorted output BAM
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        # Split alignment: align a subsection streamed through a named pipe,
        # combining back into final_file later.
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    if qual_format == "illumina":
        # Convert Illumina 1.3-1.7 quality scores on the fly via a pipe.
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
        else:
            with utils.curdir_tmpdir() as work_dir:
                with file_transaction(out_file) as tx_out_file:
                    tx_out_prefix = os.path.splitext(tx_out_file)[0]
                    # Template names must match the locals above since the
                    # command is formatted with locals().
                    cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                           "{fastq_file} {pair_file} "
                           "| {samtools} view -b -S -u - "
                           "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    cmd = cmd.format(**locals())
                    do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                           [do.file_nonempty(tx_out_file),
                            do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.

    Prefers sambamba (multi-core) when available, falling back to a
    single-core samtools sort if the multi-core run fails.

    :param in_bam: path to the input BAM
    :param config: run configuration dictionary
    :param order: "coordinate" (default) or "queryname"
    :return: path to the sorted BAM (the input itself if already sorted)
    """
    # Fixed typo in the assertion message ("in not" -> "is not").
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam
    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            # Templates format with locals(); keep these local names stable.
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core.
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; real failures still fall back.
            except Exception:
                logger.exception("Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None,
                         library="non-strand-specific"):
    """Build the qualimap ``rnaseq`` command line for a single BAM file.

    Core count and memory come from the qualimap resources section, falling
    back to the run-wide core count.  ``single_end`` is accepted for
    interface compatibility but not used here.
    """
    run_config = data["config"]
    qualimap_exe = config_utils.get_program("qualimap", run_config)
    qualimap_resources = config_utils.get_resources("qualimap", run_config)
    core_count = qualimap_resources.get("cores", dd.get_num_cores(data))
    java_mem = config_utils.adjust_memory(qualimap_resources.get("memory", "2G"),
                                          core_count)
    template = ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} -a proportional -bam {bam_file} -p {library} "
                "-gtf {gtf_file} --java-mem-size={max_mem}")
    return template.format(qualimap=qualimap_exe, out_dir=out_dir,
                           bam_file=bam_file, library=library,
                           gtf_file=gtf_file, max_mem=java_mem)
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    :param bam_file: aligned BAM to assess (downsampled unless qualimap_full
        is enabled in tools_on)
    :param data: sample data dictionary
    :param out_dir: QC output directory; per-sample results go in a
        sample-named subdirectory for MultiQC
    :return: empty dict (metric parsing currently disabled, see final comment)
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # Downsample to ~10M reads to keep qualimap runtime reasonable.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        # Bug fix: the original passed the *string* "genome_build" to
        # tz.get_in, which iterates it character-by-character as a key path
        # and always returned the default -- so MOUSE was never detected.
        elif data.get("genome_build", "").startswith(("mm", "GRCm")):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        # merge_overlaps falls through (returns falsy) without a coverage
        # file, so variant regions act as the fallback target set.
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    # return _parse_qualimap_metrics(report_file, data)
    return dict()
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    :param bam_files: list of input BAM paths
    :param work_dir: directory used for naming via _merge_outfile_fname
    :param data: sample data dictionary (config, core/memory settings)
    :param out_file: optional explicit output path
    :param batch: optional batch index used in output naming
    :return: path to the merged BAM (or the single input when already sorted)
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            # Single sorted input: record the file list for provenance but
            # reuse the input directly instead of copying it.
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        sambamba = config_utils.get_program("sambamba", data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        # Merge with or without duplicate marking per config;
                        # command templates format with locals() below.
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            # Re-check after the transaction moves the file into place.
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    :param bam_files: list of input BAM paths
    :param work_dir: directory used to derive the default output name
    :param config: run configuration dictionary
    :param out_file: optional explicit output path
    :param batch: optional batch index appended to the output name on recursion
    :return: path to the merged (or single passthrough) BAM
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            # Batch merges get a distinct "-b<N>" suffix so they don't collide.
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Recurse to merge in batches, then merge the batch outputs.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            # Plain merge when inputs are coordinate sorted;
                            # otherwise biobambam merge with duplicate marking
                            # (asserted on, since unsorted path dedups).
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None,
                         library="non-strand-specific"):
    """Build the qualimap ``rnaseq`` command line for a single BAM file.

    Prefixes the command with freetype/path exports so qualimap's Java GUI
    libraries work headless.  ``single_end`` is accepted for interface
    compatibility but not used here.
    """
    run_config = data["config"]
    qualimap_exe = config_utils.get_program("qualimap", run_config)
    qualimap_resources = config_utils.get_resources("qualimap", run_config)
    core_count = qualimap_resources.get("cores", dd.get_num_cores(data))
    java_mem = config_utils.adjust_memory(qualimap_resources.get("memory", "2G"),
                                          core_count)
    env_prefix = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    template = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
                "-a proportional -bam {bam_file} -p {library} "
                "-gtf {gtf_file} --java-mem-size={max_mem}")
    return template.format(export=env_prefix, qualimap=qualimap_exe,
                           out_dir=out_dir, bam_file=bam_file, library=library,
                           gtf_file=gtf_file, max_mem=java_mem)
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    :param bam_files: list of input BAM paths
    :param work_dir: directory used for naming via _merge_outfile_fname
    :param config: run configuration dictionary
    :param out_file: optional explicit output path
    :param batch: optional batch index used in output naming on recursion
    :return: path to the merged BAM
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            # Single sorted input: copy into place, keeping a file list for
            # provenance, then validate the transferred copy.
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                # Recurse to merge in batches, then merge the batch outputs.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        # Sorted inputs get a plain sambamba merge; unsorted
                        # inputs go through biobambam with duplicate marking.
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            # Re-check after the transaction moves the file into place.
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """Create command lines for qualimap rnaseq.

    Memory is passed via JAVA_OPTS in the export prefix (with the temp dir
    pointed at ``out_dir``) rather than --java-mem-size.  Adds --paired for
    paired-end BAMs so qualimap counts fragments instead of reads.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    # Removed a dead assignment: the original first set export to
    # java_freetype_fix + local_path_export and then immediately
    # overwrote it with this JAVA_OPTS version.
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Streams ``bwa mem`` output through ``samtools view`` and ``samtools sort``
    without intermediate files, falling back to the older ``bwa aln`` path via
    ``align`` when piping is not supported for the input.

    :param fastq_file: first-end fastq (may be replaced by a named-pipe command)
    :param pair_file: second-end fastq or falsy for single-end
    :param ref_file: bwa-indexed reference
    :param names: read group naming info (uses "lane" and "sample" keys)
    :param align_dir: output directory for the sorted BAM
    :param data: sample data dictionary
    :return: updated ``data`` with "work_bam" set to the sorted output BAM
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        # Split alignment: align a subsection streamed through a named pipe,
        # combining back into final_file later.
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    if qual_format == "illumina":
        # Convert Illumina 1.3-1.7 quality scores on the fly via a pipe.
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
        else:
            with utils.curdir_tmpdir() as work_dir:
                with file_transaction(out_file) as tx_out_file:
                    tx_out_prefix = os.path.splitext(tx_out_file)[0]
                    # Template names must match the locals above since the
                    # command is formatted with locals().
                    cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                           "{fastq_file} {pair_file} "
                           "| {samtools} view -b -S -u - "
                           "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    cmd = cmd.format(**locals())
                    do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                           [do.file_nonempty(tx_out_file),
                            do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    :param bam_files: list of input BAM paths to merge
    :param work_dir: directory used to derive the default output name
    :param config: run configuration dictionary
    :param out_file: optional explicit output path
    :param batch: optional batch index appended to the output name on recursion
    :return: path to the merged (or single passthrough) BAM
    """
    if len(bam_files) == 1:
        # Nothing to merge; hand back the single input untouched.
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            # Batch merges get a distinct "-b<N>" suffix so they don't collide.
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            # Memory reduced since samtools sort runs per-core alongside the merge.
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            # Stay under the OS open-file limit, keeping headroom of 100 handles.
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                # Recurse to merge in batches, then merge the batch outputs.
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            # merge piped into sort; template names must match
                            # the locals above since it formats with locals().
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            for b in bam_files:
                # Reclaim disk from per-batch/input BAMs now that they are merged.
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.

    :param bam_files: list of input BAM paths
    :param work_dir: directory used for naming via _merge_outfile_fname
    :param data: sample data dictionary (config, core/memory settings)
    :param out_file: optional explicit output path
    :param batch: optional batch index used in output naming
    :return: path to the merged BAM (or the single input when already sorted)
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            # Single sorted input: record the file list for provenance but
            # reuse the input directly instead of copying it.
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        # Merge with or without duplicate marking per config;
                        # command templates format with locals() below.
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            # Re-check after the transaction moves the file into place.
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Skips the run when the HTML report already exists, then parses the
    report into metrics either way.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if os.path.exists(report_file):
        # Already computed on a previous run; just parse the report.
        return _parse_qualimap_metrics(report_file)
    utils.safe_makedir(out_dir)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    qualimap = config_utils.get_program("qualimap", data["config"])
    resources = config_utils.get_resources("qualimap", data["config"])
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         num_cores)
    # Base headless bamqc invocation; extended below depending on inputs.
    # The template formats with locals(), so the names above must stay.
    cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
           "-nt {num_cores} --java-mem-size={max_mem}")
    species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
    if species in ["HUMAN", "MOUSE"]:
        cmd += " -gd {species}"
    regions = data["config"]["algorithm"].get("variant_regions")
    if regions:
        bed6_regions = _bed_to_bed6(regions, out_dir)
        cmd += " -gff {bed6_regions}"
    do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)