def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information
    :returns: dict with the QC output directory under "qc" and the merged
        metrics of every tool that ran under "metrics"
    """
    analysis = data["analysis"].lower()

    to_run = [("fastqc", _run_fastqc)]
    if analysis.startswith("rna-seq"):
        to_run.append(("rnaseqc", bcbio.rnaseq.qc.sample_summary))
        # Gene coverage (_run_gene_coverage) is currently disabled.
        to_run.append(("complexity", _run_complexity))
    elif analysis.startswith("chip-seq"):
        to_run.append(("bamtools", _run_bamtools_stats))
    else:
        to_run += [("bamtools", _run_bamtools_stats),
                   ("gemini", _run_gemini_stats)]

    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))

    metrics = {}
    for program_name, qc_fn in to_run:
        cur_qc_dir = os.path.join(qc_dir, program_name)
        metrics.update(qc_fn(bam_file, data, cur_qc_dir))

    # Check the cheap configuration conditions first so the aligned-read
    # ratio is only computed from the BAM file when kraken could actually run.
    kraken_enabled = utils.get_in(data, ("config", "algorithm", "kraken"), False)
    if kraken_enabled and analysis == "rna-seq":
        ratio = bam.get_aligned_reads(bam_file, data)
        # Kraken is only run for samples where fewer than 60% of reads aligned.
        if ratio < 0.60:
            metrics.update(_run_kraken(data, ratio))

    metrics["Name"] = data["name"][-1]
    metrics["Quality format"] = utils.get_in(
        data, ("config", "algorithm", "quality_format"), "standard").lower()
    return {"qc": qc_dir, "metrics": metrics}
def _run_qc_tools(bam_file, data):
    """Run the third party QC programs selected for this sample.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information
    :returns: dict with the QC directory under "qc" and the merged tool
        metrics under "metrics"
    """
    disabled = tz.get_in(("config", "algorithm", "tools_off"), data, [])
    programs = [] if "fastqc" in disabled else [("fastqc", _run_fastqc)]

    analysis = data["analysis"].lower()
    if analysis.startswith("rna-seq"):
        # rnaseqc, gene coverage and complexity runners are disabled;
        # only the RNA-seq flavour of qualimap is scheduled.
        programs.append(("qualimap", _rnaseq_qualimap))
    elif analysis.startswith("chip-seq"):
        programs.append(("bamtools", _run_bamtools_stats))
    elif not analysis.startswith("smallrna-seq"):
        programs.extend([("bamtools", _run_bamtools_stats),
                         ("gemini", _run_gemini_stats)])
        if analysis.startswith(("standard", "variant2")):
            programs.append(("qsignature", _run_qsignature_generator))
        enabled = tz.get_in(("config", "algorithm", "tools_on"), data, [])
        if "qualimap" in enabled:
            programs.append(("qualimap", _run_qualimap))

    qc_root = os.path.join(data["dirs"]["work"], "qc", data["description"])
    qc_dir = utils.safe_makedir(qc_root)

    # Each program writes into its own sub-directory and contributes its
    # metrics to one shared dictionary.
    metrics = {}
    for name, runner in programs:
        metrics.update(runner(bam_file, data, os.path.join(qc_dir, name)))

    # Kraken runs whenever it is configured; the former gate on aligned-read
    # ratio and analysis type is disabled.
    if data['config']["algorithm"].get("kraken", None):
        aligned_ratio = bam.get_aligned_reads(bam_file, data)
        metrics.update(_run_kraken(data, aligned_ratio))

    # Clean up the "<bam base>-downsample<ext>" file next to the input BAM.
    bam_base, bam_ext = os.path.splitext(bam_file)
    bam.remove("%s-downsample%s" % (bam_base, bam_ext))

    metrics["Name"] = data["name"][-1]
    quality_format = utils.get_in(
        data, ("config", "algorithm", "quality_format"), "standard")
    metrics["Quality format"] = quality_format.lower()
    return {"qc": qc_dir, "metrics": metrics}
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information
    :returns: dict with the QC output directory under "qc" and the merged
        metrics of every tool that ran under "metrics"
    """
    # `tools_off` / `tools_on` can be present in the configuration but left
    # empty (None); `or []` keeps the membership tests below from raising
    # TypeError in that case.
    tools_off = tz.get_in(("config", "algorithm", "tools_off"), data, []) or []
    tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, []) or []
    analysis = data["analysis"].lower()

    to_run = []
    if "fastqc" not in tools_off:
        to_run.append(("fastqc", _run_fastqc))
    if analysis.startswith("rna-seq"):
        # rnaseqc, gene coverage and complexity runners are disabled;
        # only the RNA-seq flavour of qualimap is scheduled.
        to_run.append(("qualimap", _rnaseq_qualimap))
    elif analysis.startswith("chip-seq"):
        to_run.append(("bamtools", _run_bamtools_stats))
    elif not analysis.startswith("smallrna-seq"):
        to_run += [("bamtools", _run_bamtools_stats),
                   ("gemini", _run_gemini_stats)]
        if analysis.startswith(("standard", "variant2")):
            to_run.append(("qsignature", _run_qsignature_generator))
        if "qualimap" in tools_on:
            to_run.append(("qualimap", _run_qualimap))

    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))

    metrics = {}
    for program_name, qc_fn in to_run:
        cur_qc_dir = os.path.join(qc_dir, program_name)
        metrics.update(qc_fn(bam_file, data, cur_qc_dir))

    # Kraken runs whenever it is configured; the former gate on aligned-read
    # ratio and analysis type is disabled.
    if data['config']["algorithm"].get("kraken", None):
        ratio = bam.get_aligned_reads(bam_file, data)
        metrics.update(_run_kraken(data, ratio))

    # Clean up the "<bam base>-downsample<ext>" file next to the input BAM.
    bam.remove("%s-downsample%s" % os.path.splitext(bam_file))

    metrics["Name"] = data["name"][-1]
    metrics["Quality format"] = utils.get_in(
        data, ("config", "algorithm", "quality_format"), "standard").lower()
    return {"qc": qc_dir, "metrics": metrics}
def run(bam_file, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.

    Only the first input file of the sample is used (first of paired reads).

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information
    :param out_dir: directory where the kraken output is stored
    :returns: the metrics parsed by ``_parse_kraken_output``, or
        ``{"kraken_report": "null"}`` when the database is missing or the
        sample input is a BAM file
    """
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    # NOTE(review): the aligned-read ratio is not used anywhere in this
    # function. The call is kept only in case something relies on its side
    # effects -- confirm and drop it if not.
    bam.get_aligned_reads(bam_file, data)

    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        # "minikraken" is a shortcut for the database under the data directory.
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")

    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    # Skip the kraken run when a previous run already left its output behind.
    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            # The command below streams reads with --fastq-input.
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                # Name every template field explicitly instead of relying on
                # locals(), so renaming a variable cannot silently break it.
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin 2> {out_stats}"
                      ).format(cat=cat, fn_file=fn_file, kraken_cmd=kraken_cmd,
                               db=db, num_cores=num_cores, out=out,
                               out_stats=out_stats)
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                # Replace any stale output with the freshly produced directory.
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)

    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
def run(bam_file, data, out_dir):
    """Classify the sample reads with kraken and return the parsed metrics.

    Only the first input file of the sample is used (first of paired reads).
    The kraken run is skipped when ``out_dir`` already holds a ``kraken_out``
    file; the metrics are then parsed from the existing output.
    """
    sample_name = dd.get_sample_name(data)
    logger.info("Running kraken to determine contaminant: %s" % sample_name)
    ratio = bam.get_aligned_reads(bam_file, data)

    db_path = data['config']["algorithm"]["kraken"]
    kraken_exe = config_utils.get_program("kraken", data["config"])
    if db_path == "minikraken":
        # Shortcut name for the database under the data directory.
        db_path = os.path.join(install._get_data_dir(), "genomes", "kraken",
                               "minikraken")
    if not os.path.exists(db_path):
        logger.info("kraken: no database found %s, skipping" % db_path)
        return {"kraken_report": "null"}

    # Reuse the results of a previous run when they are already in place.
    if os.path.exists(os.path.join(out_dir, "kraken_out")):
        return _parse_kraken_output(out_dir, db_path, data)

    utils.safe_makedir(os.path.dirname(out_dir))
    threads = data["config"]["algorithm"].get("num_cores", 1)
    if dd.get_save_diskspace(data):
        reads_file = data["files_orig"][0]
    else:
        reads_file = data["files"][0]
    if reads_file.endswith("bam"):
        logger.info("kraken: need fastq files as input")
        return {"kraken_report": "null"}

    template = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                "--preload --min-hits 2 "
                "--threads {num_cores} "
                "--out {out} --fastq-input /dev/stdin 2> {out_stats}")
    with tx_tmpdir(data) as tx_tmp_dir, utils.chdir(tx_tmp_dir):
        command = template.format(
            cat="zcat" if reads_file.endswith(".gz") else "cat",
            fn_file=reads_file,
            kraken_cmd=kraken_exe,
            db=db_path,
            num_cores=threads,
            out=os.path.join(tx_tmp_dir, "kraken_out"),
            out_stats=os.path.join(tx_tmp_dir, "kraken_stats"))
        do.run(command, "kraken: %s" % dd.get_sample_name(data))
        # Swap the finished temporary directory into place as out_dir.
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        shutil.move(tx_tmp_dir, out_dir)

    return _parse_kraken_output(out_dir, db_path, data)