def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Writes qualimapReport.html / rnaseq_qc_results.txt into out_dir (skipped
    when the report already exists) and returns a dict of parsed metrics.
    """
    # map bcbio strandedness names to qualimap's library protocol values
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        # rewrite the recorded bam filename so downstream parsers (e.g. MultiQC)
        # pick up the sample name instead of the work bam path
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from FASTQ input with Salmon.

    fq2 may be None for single-end data. Returns the path to the quant.sf
    results file, skipping the run when results already exist.
    (Fix: removed a duplicated num_cores lookup and an unused samplename local.)
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # stream gzipped FASTQs to salmon via bash process substitution
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from FASTQ input with Salmon.

    fq2 may be None for single-end data. User-supplied resource options for
    salmon are appended to the command line. Returns the path to quant.sf,
    skipping the run when results already exist.
    (Fix: removed a duplicated num_cores lookup and an unused samplename
    local; collapsed the redundant two-step options lookup.)
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    # flatten configured extra options into command line parameters;
    # `or []` also covers an explicit null in the configuration
    params = " ".join([str(x) for x in resources.get("options") or []])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    # stream plain and gzipped FASTQs uniformly via process substitution
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with kallisto quant; returns the quant directory.

    fq2 may be None for single-end data, in which case a default fragment
    length distribution (mean 200, sd 25) is assumed.
    (Fix: added an idempotence check on abundance.h5 so existing results are
    reused instead of re-running quantification every call, matching the
    behavior of the other quantifier functions; dropped unused locals.)
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    # abundance.h5 marks a completed kallisto run
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with kallisto, reusing results when present.

    fq2 may be None for single-end data, in which case default fragment
    length parameters are passed and a warning is logged. Returns the
    quantification output directory.
    """
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    # a pre-existing abundance.h5 marks a completed run
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    in_fusion_mode = dd.get_fusion_mode(data) or dd.get_fusion_caller(data)
    fusion_flag = "--fusion" if in_fusion_mode else ""
    bootstrap_flag = "--bootstrap-samples=30"
    if fq2:
        single_flag = fraglength_flag = sd_flag = ""
    else:
        # single-end: kallisto needs an explicit fragment length distribution
        single_flag = "--single"
        fraglength_flag = "--fragment-length=200"
        sd_flag = "--sd=25"
        fq2 = ""
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Returns a dict with the base QC results file, secondary qualimap files
    and the parsed metrics. By default the bam is downsampled to 1e7 reads
    before running qualimap; set tools_on: [qualimap_full] to use the full bam.
    """
    # NOTE(review): this mapping is inverted relative to the other run_rnaseq
    # variants in this file (firststrand -> forward here) — confirm which one
    # matches qualimap's -p protocol semantics.
    strandedness = {
        "firststrand": "strand-specific-forward",
        "secondstrand": "strand-specific-reverse",
        "unstranded": "non-strand-specific",
        "auto": "non-strand-specific"
    }
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = strandedness[dd.get_strandedness(data)]
    # don't run qualimap on the full bam by default
    if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")
        ds_bam = bam_file
    else:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
    # downsample may return a falsy value; fall back to the original bam
    bam_file = ds_bam if ds_bam else bam_file
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            # rewrite the recorded bam filename so report parsers pick up the
            # sample name instead of the (possibly downsampled) work bam path
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update(
        {"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
        "metrics": metrics
    }
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from FASTQ input with Salmon (index-based).

    fq2 may be None for single-end data. Returns the path to quant.sf
    inside salmon_dir, skipping the run when it already exists.
    (Fix: removed a duplicated num_cores lookup and an unused samplename
    local.)
    """
    safe_makedir(salmon_dir)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    # builds the combined transcriptome FASTA as a side effect; the returned
    # path is not used by the command below -- TODO confirm still needed
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # stream gzipped FASTQs to salmon via bash process substitution
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Results are written transactionally into a per-sample directory under
    out_dir; returns a dict of parsed metrics.
    """
    # map bcbio strandedness names to qualimap's library protocol values
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            # rewrite the recorded bam filename so downstream parsers pick up
            # the sample name instead of the work bam path
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def bcbio_run(data):
    """Run DEXSeq exon counting for one sample; returns the counts file path."""
    out_dir = os.path.join(dd.get_work_dir(data), "dexseq")
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, dd.get_sample_name(data) + ".dexseq")
    return run_count(dd.get_work_bam(data), dd.get_dexseq_gff(data),
                     dd.get_strandedness(data), out_file, data)
def _get_stranded_flag(data):
    """Return the --stranded flag value (no/yes/reverse) for this sample.

    Defaults to "unstranded" when strandedness is not configured.
    (Fix: the assert message never substituted the %s placeholder and had an
    unterminated 'unstranded quote.)
    """
    strand_flag = {"unstranded": "no",
                   "firststrand": "reverse",
                   "secondstrand": "yes"}
    stranded = dd.get_strandedness(data, "unstranded").lower()
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
                                     "Valid values are 'firststrand', 'secondstrand', "
                                     "and 'unstranded'." % stranded)
    return strand_flag[stranded]
def _strand_flag(data):
    """
    0: unstranded 1: stranded 2: reverse stranded

    Returns the numeric strandedness code (as a string) for this sample.
    (Fix: the assert message never substituted the %s placeholder and had an
    unterminated 'unstranded quote.)
    """
    strand_flag = {"unstranded": "0",
                   "firststrand": "2",
                   "secondstrand": "1"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, (
        "%s is not a valid strandedness value. "
        "Valid values are 'firststrand', 'secondstrand', "
        "and 'unstranded'." % stranded)
    return strand_flag[stranded]
def _strand_flag(data):
    """
    0: unstranded 1: stranded 2: reverse stranded

    Returns the numeric strandedness code (as a string) for this sample.
    (Fix: the assert message never substituted the %s placeholder and had an
    unterminated 'unstranded quote.)
    """
    strand_flag = {"unstranded": "0",
                   "firststrand": "2",
                   "secondstrand": "1"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, (
        "%s is not a valid strandedness value. "
        "Valid values are 'firststrand', 'secondstrand', "
        "and 'unstranded'." % stranded)
    return strand_flag[stranded]
def _get_stranded_flag(data):
    """Return the --stranded flag value (no/yes/reverse) for this sample.

    Defaults to "unstranded" when strandedness is not configured.
    (Fix: the assert message never substituted the %s placeholder and had an
    unterminated 'unstranded quote.)
    """
    strand_flag = {
        "unstranded": "no",
        "firststrand": "reverse",
        "secondstrand": "yes"
    }
    stranded = dd.get_strandedness(data, "unstranded").lower()
    assert stranded in strand_flag, (
        "%s is not a valid strandedness value. "
        "Valid values are 'firststrand', 'secondstrand', "
        "and 'unstranded'." % stranded)
    return strand_flag[stranded]
def _set_stranded_flag(bam_file, data):
    """Return the strandedness command line flag for this sample.

    Single-end stranded data gets the "-s" variants of the flags;
    unstranded data gets an empty string.
    (Fix: the assert message had an unterminated 'unstranded quote.)
    """
    strand_flag = {"unstranded": "",
                   "firststrand": "--rf-stranded",
                   "secondstrand": "--fr-stranded",
                   "firststrand-s": "--r-stranded",
                   "secondstrand-s": "--f-stranded"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
                                     "Valid values are 'firststrand', "
                                     "'secondstrand' and 'unstranded'." % (stranded))
    # single-end stranded data uses the single-read flag variants
    if stranded != "unstranded" and not is_paired(bam_file):
        stranded += "-s"
    flag = strand_flag[stranded]
    return flag
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Results are written transactionally into a per-sample directory under
    out_dir. Returns a dict with the base QC results file, secondary
    qualimap files and the parsed metrics.
    """
    # map bcbio strandedness names to qualimap's library protocol values
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file,
                                       single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            # rewrite the recorded bam filename so report parsers pick up the
            # sample name instead of the work bam path
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
        "metrics": metrics
    }
def _get_stranded_flag(data, paired):
    """Return the --rna-strandness flag for this sample, or "" if unstranded.

    Paired data maps firststrand/secondstrand to RF/FR; single-end data maps
    them to R/F.
    (Fix: the single-end branch compared against the misspelled "firstrand",
    so single-end firststrand samples silently fell through to "".)
    """
    strandedness = dd.get_strandedness(data)
    base = "--rna-strandness "
    if paired:
        if strandedness == "firststrand":
            return base + "RF"
        elif strandedness == "secondstrand":
            return base + "FR"
        else:
            return ""
    else:
        if strandedness == "firststrand":
            return base + "R"
        elif strandedness == "secondstrand":
            return base + "F"
        else:
            return ""
def _set_stranded_flag(bam_file, data):
    """Return the strandedness command line flag for this sample.

    Single-end stranded data gets the "-s" variants of the flags;
    unstranded data gets an empty string.
    (Fix: the assert message had an unterminated 'unstranded quote.)
    """
    strand_flag = {
        "unstranded": "",
        "firststrand": "--rf-stranded",
        "secondstrand": "--fr-stranded",
        "firststrand-s": "--r-stranded",
        "secondstrand-s": "--f-stranded"
    }
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
                                     "Valid values are 'firststrand', "
                                     "'secondstrand' and 'unstranded'." % (stranded))
    # single-end stranded data uses the single-read flag variants
    if stranded != "unstranded" and not is_paired(bam_file):
        stranded += "-s"
    flag = strand_flag[stranded]
    return flag
def run_sailfish(data):
    """Run sailfish quantification for one sample, storing the output
    locations back into the sample data structure."""
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    sailfish_dir = os.path.join(dd.get_work_dir(data), "sailfish", samplename)
    # paired-end samples provide two FASTQ files, single-end just one
    fq1 = files[0]
    fq2 = files[1] if len(files) == 2 else None
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify single-cell UMI data with kallisto pseudo.

    Converts the sample to kallisto's UMI batch format, runs
    ``kallisto pseudo --umi`` transactionally and post-processes the output
    with kallisto_table. Returns the quantification directory.
    (Fix: removed unused locals -- samplename, strandedness, and frag_length;
    the latter carried a misleading comment but was never passed to the
    command.)
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    batch_file = umi.convert_to_kallisto(data)
    index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    cmd = ("{kallisto} pseudo --umi "
           "-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
    # run from the batch file's directory -- presumably the batch file
    # references its FASTQ inputs with relative paths; TODO confirm
    with chdir(os.path.dirname(batch_file)):
        with file_transaction(data, quant_dir) as tx_out_dir:
            message = ("Quantifying transcripts with Kallisto.")
            do.run(cmd.format(**locals()), message, None)
    kallisto_table(kallisto_dir, index)
    return quant_dir
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from a transcriptome-aligned BAM with Salmon.

    Uses the combined transcriptome FASTA as the quantification target and
    returns the path to quant.sf, skipping the run when it already exists.
    gtf_file and ref_file are kept for interface compatibility with the
    FASTQ-based variant.
    (Fix: removed a duplicated num_cores lookup and an unused samplename
    local.)
    """
    safe_makedir(salmon_dir)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Results are written transactionally into a per-sample directory under
    out_dir. Returns a dict with the base QC results file, secondary
    qualimap files and the parsed metrics.
    """
    # map bcbio strandedness names to qualimap's library protocol values
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            # rewrite the recorded bam filename so report parsers pick up the
            # sample name instead of the work bam path
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    Results are written into a per-sample directory under out_dir (skipped
    when the report already exists); returns a dict of parsed metrics.
    """
    # map bcbio strandedness names to qualimap's library protocol values
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    raw_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(results_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, results_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        # rewrite the recorded bam filename so report parsers pick up the
        # sample name instead of the work bam path
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
            dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    Counts reads overlapping exon features (union mode, keyed on gene_id),
    writes a per-gene counts file plus a stats file of unassigned-read
    categories, and returns the counts file path.
    """
    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0
    if file_exists(out_file):
        return out_file
    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s." %
                (os.path.basename(sam_filename),
                 os.path.basename(gff_filename), dd.get_strandedness(data)))
    # genomic array of feature-id sets; strand-aware unless stranded == "no"
    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}
    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()
    # --- phase 1: load exon features from the GFF into the genomic array ---
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise
    sys.stderr.write("%d GFF lines processed.\n" % i)
    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)
    # --- phase 2: detect paired-end mode from the first alignment ---
    # NOTE(review): iter(...).next() is the Python 2 iterator protocol;
    # this block predates a Python 3 port.
    try:
        align_reader = htseq_reader(sam_filename)
        first_read = iter(align_reader).next()
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise
    # --- phase 3: walk the alignments and assign reads to features ---
    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        # NOTE(review): read_seq is only bound when pe_mode is true; as
        # written, single-end input would raise NameError below — confirm
        # callers guarantee paired data or that a binding is lost upstream.
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                # single-end: filter unaligned / multimapped / low-quality
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                # paired-end: collect aligned intervals from both mates;
                # mate 2 is on the opposite strand of the fragment
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv) for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue
            # resolve the set of candidate features for this read/pair
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                # unique feature -> count; 0 -> no_feature; >1 -> ambiguous
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # reads on chromosomes absent from the GFF count as no_feature
                # (rr is assigned but unused -- kept from the upstream script)
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1
            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))
    except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise
    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))
    # --- phase 4: write counts and unassigned-read statistics ---
    with file_transaction(data, out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))
    with file_transaction(data, stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)
    return out_file
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    Counts reads overlapping exon features (union mode, keyed on gene_id),
    writes a per-gene counts file plus a stats file of unassigned-read
    categories, and returns the counts file path.
    """
    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0
    if file_exists(out_file):
        return out_file
    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s." %
                (os.path.basename(sam_filename),
                 os.path.basename(gff_filename), dd.get_strandedness(data)))
    # genomic array of feature-id sets; strand-aware unless stranded == "no"
    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}
    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()
    # --- phase 1: load exon features from the GFF into the genomic array ---
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise
    sys.stderr.write("%d GFF lines processed.\n" % i)
    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)
    # --- phase 2: detect paired-end mode from the first alignment ---
    # NOTE(review): iter(...).next() is the Python 2 iterator protocol;
    # this block predates a Python 3 port.
    try:
        align_reader = htseq_reader(sam_filename)
        first_read = iter(align_reader).next()
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise
    # --- phase 3: walk the alignments and assign reads to features ---
    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        # NOTE(review): read_seq is only bound when pe_mode is true; as
        # written, single-end input would raise NameError below — confirm
        # callers guarantee paired data or that a binding is lost upstream.
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                # single-end: filter unaligned / multimapped / low-quality
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                # paired-end: collect aligned intervals from both mates;
                # mate 2 is on the opposite strand of the fragment
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(iv_seq,
                                                 (invert_strand(co.ref_iv) for co in r[1].cigar
                                                  if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(iv_seq,
                                                 (co.ref_iv for co in r[1].cigar
                                                  if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue
            # resolve the set of candidate features for this read/pair
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                # unique feature -> count; 0 -> no_feature; >1 -> ambiguous
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # reads on chromosomes absent from the GFF count as no_feature
                # (rr is assigned but unused -- kept from the upstream script)
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1
            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))
    except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise
    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))
    # --- phase 4: write counts and unassigned-read statistics ---
    with file_transaction(data, out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))
    with file_transaction(data, stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)
    return out_file