def run_rnaseq(bam_file, data, out_dir):
    """Run Qualimap RNA-seq QC on a BAM file and gather the resulting metrics.

    Unless "qualimap_full" is listed under config/algorithm/tools_on, the BAM
    is downsampled first and Qualimap runs on the downsampled file.

    Args:
        bam_file: Path to the BAM file to analyse.
        data: Sample dictionary; must contain a "config" entry and is queried
            through the ``dd`` accessors (sample name, GTF, strandedness).
        out_dir: Directory that receives the per-sample Qualimap folder and
            a copy of the base results file.

    Returns:
        A dict with the keys "base" (results file copied into ``out_dir``),
        "secondary" (result of ``_find_qualimap_secondary_files``) and
        "metrics" (result of ``_parse_metrics``).

    Raises:
        KeyError: If the sample's strandedness is not a key of the protocol
            table below.
    """
    protocol_by_strand = {
        "firststrand": "strand-specific-forward",
        "secondstrand": "strand-specific-reverse",
        "unstranded": "non-strand-specific",
        "auto": "non-strand-specific",
    }
    qc_basename = "rnaseq_qc_results.txt"

    # The Qualimap folder has to carry the sample name: MultiQC derives the
    # sample name from the directory when it reads files such as
    # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    results_file = os.path.join(results_dir, qc_basename)
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = protocol_by_strand[dd.get_strandedness(data)]

    # By default Qualimap is not run on the full BAM.
    tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
    if "qualimap_full" not in tools_on:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        # Keep the original BAM when downsampling returns nothing usable.
        if downsampled:
            bam_file = downsampled
    else:
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")

    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            qualimap_cmd = _rnaseq_qualimap_cmd(
                data, bam_file, tx_results_dir, gtf_file, library)
            do.run(qualimap_cmd, "Qualimap for {}".format(sample_name))
            # Rewrite the "bam file = ..." line of the report so that it
            # shows <sample name>.bam instead of the path Qualimap recorded.
            tx_results_file = os.path.join(tx_results_dir, qc_basename)
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                sample_name, tx_results_file)
            do.run(rename_cmd,
                   "Fix Name Qualimap for {}".format(sample_name))

    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics["Average_insert_size"] = salmon.estimate_fragment_size(data)
    metrics = _parse_metrics(metrics)

    # results_dir must stay named after the sample (see above), so the base
    # QC file is copied into out_dir to keep its name after upload.
    base_results_file = os.path.join(out_dir, qc_basename)
    shutil.copyfile(results_file, base_results_file)
    secondary_files = _find_qualimap_secondary_files(
        results_dir, base_results_file)
    return {
        "base": base_results_file,
        "secondary": secondary_files,
        "metrics": metrics,
    }
def run_rnaseq(bam_file, data, out_dir):
    """Run Qualimap RNA-seq QC on a BAM file and gather the resulting metrics.

    Args:
        bam_file: Path to the BAM file to analyse.
        data: Sample dictionary; must contain a "config" entry and is queried
            through the ``dd`` accessors (sample name, GTF, strandedness).
        out_dir: Directory that receives the per-sample Qualimap folder and
            a copy of the base results file.

    Returns:
        A dict with the keys "base" (results file copied into ``out_dir``),
        "secondary" (result of ``_find_qualimap_secondary_files``) and
        "metrics" (result of ``_parse_metrics``).

    Raises:
        KeyError: If the sample's strandedness is not a key of the protocol
            table below.
    """
    protocol_by_strand = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific",
    }
    qc_basename = "rnaseq_qc_results.txt"

    # The Qualimap folder has to carry the sample name: MultiQC derives the
    # sample name from the directory when it reads files such as
    # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    results_file = os.path.join(results_dir, qc_basename)
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = protocol_by_strand[dd.get_strandedness(data)]

    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            qualimap_cmd = _rnaseq_qualimap_cmd(
                data, bam_file, tx_results_dir, gtf_file, single_end, library)
            do.run(qualimap_cmd, "Qualimap for {}".format(sample_name))
            # Rewrite the "bam file = ..." line of the report so that it
            # shows <sample name>.bam instead of the path Qualimap recorded.
            tx_results_file = os.path.join(tx_results_dir, qc_basename)
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                sample_name, tx_results_file)
            do.run(rename_cmd,
                   "Fix Name Qualimap for {}".format(sample_name))

    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics["Average_insert_size"] = salmon.estimate_fragment_size(data)
    metrics = _parse_metrics(metrics)

    # results_dir must stay named after the sample (see above), so the base
    # QC file is copied into out_dir to keep its name after upload.
    base_results_file = os.path.join(out_dir, qc_basename)
    shutil.copyfile(results_file, base_results_file)
    secondary_files = _find_qualimap_secondary_files(
        results_dir, base_results_file)
    return {
        "base": base_results_file,
        "secondary": secondary_files,
        "metrics": metrics,
    }
def run_rnaseq(bam_file, data, out_dir):
    """Run Qualimap on an RNA-seq BAM file and return its QC outputs.

    Args:
        bam_file: Path to the BAM file handed to Qualimap.
        data: Sample dictionary; needs a "config" entry and is read through
            the ``dd`` accessors (sample name, GTF file, strandedness).
        out_dir: Parent directory for the per-sample Qualimap folder; the
            base results file is also copied here.

    Returns:
        A dict holding "base" (path of the results file copy in ``out_dir``),
        "secondary" (whatever ``_find_qualimap_secondary_files`` reports) and
        "metrics" (the value returned by ``_parse_metrics``).

    Raises:
        KeyError: If the strandedness reported for the sample has no entry
            in the lookup table below.
    """
    library_types = {"firststrand": "strand-specific-reverse",
                     "secondstrand": "strand-specific-forward",
                     "unstranded": "non-strand-specific"}
    results_name = "rnaseq_qc_results.txt"

    # Qualimap writes into a folder named after the sample because MultiQC
    # takes the sample name from that folder, e.g.
    # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample)
    results_file = os.path.join(results_dir, results_name)
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = library_types[dd.get_strandedness(data)]

    # Skip the Qualimap run when a results file is already present.
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            run_cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir,
                                           gtf_file, single_end, library)
            do.run(run_cmd, "Qualimap for {}".format(sample))
            # Replace the recorded BAM path in the report with
            # <sample name>.bam.
            tx_results_file = os.path.join(tx_results_dir, results_name)
            fix_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                sample, tx_results_file)
            do.run(fix_cmd, "Fix Name Qualimap for {}".format(sample))

    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics["Average_insert_size"] = salmon.estimate_fragment_size(data)
    metrics = _parse_metrics(metrics)

    # The sample-named folder must keep its name, so the base QC file is
    # duplicated into out_dir, where it keeps its own name after upload.
    base_results_file = os.path.join(out_dir, results_name)
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file,
            "secondary": secondary,
            "metrics": metrics}