def main(args):
    """Run RSEM on an annotation bam and write a genes-detected QC JSON.

    Steps, in order:
      1. Derive ``bam_root`` from the basename of ``args.anno_bam`` with a
         trailing ``.bam`` removed.
      2. Extract the gzipped tar ``args.rsem_index`` into the current
         directory, using the members produced by
         ``make_modified_TarInfo(archive, "rsem_index")``.
      3. Format ``RSEM_COMMAND`` with the run parameters and execute it.
      4. Compute the number of detected genes from
         ``<bam_root>_rsem.genes.results`` and dump it to
         ``<bam_root>_number_of_genes_detected.json``.

    Args:
        args: parsed command line namespace providing ``anno_bam``,
            ``rsem_index``, ``rnd_seed``, ``ncpus``, ``ramGB``,
            ``read_strand`` and ``endedness``.

    Raises:
        subprocess.CalledProcessError: if the RSEM command exits non-zero.
    """
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    bam_root = re.sub(r"\.bam$", "", os.path.basename(args.anno_bam))
    with tarfile.open(args.rsem_index, "r:gz") as archive:
        archive.extractall(
            ".", members=make_modified_TarInfo(archive, "rsem_index"))
    rsem_call = shlex.split(
        RSEM_COMMAND.format(
            rnd_seed=args.rnd_seed,
            ncpus=args.ncpus,
            ramGB=args.ramGB,
            fwd_prob=strand_to_fwd_prob(args.read_strand),
            paired_end=format_endedness(args.endedness),
            anno_bam=args.anno_bam,
            bam_root=bam_root,
        ))
    logger.info("Running RSEM command %s", " ".join(rsem_call))
    # Fail here, with the real exit status, instead of later on a missing
    # results file when RSEM did not finish successfully.
    subprocess.check_call(rsem_call)
    gene_quant_fn = bam_root + "_rsem.genes.results"
    number_of_genes_detected = calculate_number_of_genes_detected(
        gene_quant_fn)
    number_of_genes_detected_dict = {
        "number_of_genes_detected": number_of_genes_detected
    }
    qc_record = QCMetricRecord()
    number_of_genes_QC = QCMetric("number_of_genes_detected",
                                  number_of_genes_detected_dict)
    qc_record.add(number_of_genes_QC)
    with open(bam_root + "_number_of_genes_detected.json", "w") as f:
        json.dump(qc_record.to_ordered_dict(), f)
def main(args):
    """Write the Spearman correlation of two quantification tables as QC JSON.

    The two tab-separated files ``args.quants[0]`` and ``args.quants[1]`` are
    read without a header row and with their first four lines skipped.
    Column 1 of each table is correlated (Spearman) and the result is stored
    under the ``spearman_correlation`` metric in ``args.output_filename``.
    """
    read_options = {"sep": "\t", "header": None, "skiprows": 4}
    first_table = pd.read_csv(args.quants[0], **read_options)
    second_table = pd.read_csv(args.quants[1], **read_options)

    rho = first_table[1].corr(second_table[1], method="spearman")

    record = QCMetricRecord()
    record.add(QCMetric("spearman_correlation", {"spearman_correlation": rho}))

    with open(args.output_filename, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Collect miRNA QC metrics and write them to a JSON file.

    Three metrics are added to one QC record:
      * ``expressed_mirnas``: number of rows in the quantification table
        whose counts-per-million value (column 1 scaled by the column total)
        is at least 2.
      * ``star_qc_metric``: built from ``args.star_log`` with
        ``parse_starlog`` as the parser.
      * ``aligned_reads``: uniquely mapped reads plus reads mapped to
        multiple loci, both taken from the parsed star log.

    Args:
        args: parsed command line namespace providing ``quants``,
            ``star_log`` and ``output_filename``.
    """
    # Lazy logger arguments: formatting happens only if the record is emitted.
    logger.info("Reading input tsv: %s", args.quants)
    quants_tsv = pd.read_csv(args.quants, sep="\t", header=None, skiprows=4)
    # calculate number of mirnas expressed at cpm>=2
    per_million = quants_tsv[1].sum() / 1000000
    quants_tsv["cpm"] = quants_tsv[1] / per_million
    cpm_gte2 = sum(quants_tsv["cpm"] >= 2)
    star_qc_record = QCMetricRecord()
    cpm_metric = QCMetric("expressed_mirnas", {"expressed_mirnas": cpm_gte2})
    # get metrics from star log
    star_qc = QCMetric("star_qc_metric", args.star_log, parse_starlog)
    star_qc_record.add_all([cpm_metric, star_qc])
    # calculate number of reads (unique + multimapping)
    reads_mapped = int(star_qc.content["Uniquely mapped reads number"]) + int(
        star_qc.content["Number of reads mapped to multiple loci"]
    )
    reads_mapped_qc = QCMetric("aligned_reads", {"aligned_reads": reads_mapped})
    star_qc_record.add(reads_mapped_qc)
    logger.info("Writing output json %s", args.output_filename)
    with open(args.output_filename, "w") as fp:
        json.dump(star_qc_record.to_ordered_dict(), fp)
def main(args):
    """Count gene types in a bam and write the counts as QC JSON.

    Loads the transcript-id to gene-type mapping from
    ``args.tr_id_to_gene_type_tsv``, passes it together with
    ``args.input_bam`` to ``get_gene_type_counts`` and stores the result
    under the ``gene_type_count`` metric in ``args.output_filename``.
    """
    record = QCMetricRecord()

    mapping_path = args.tr_id_to_gene_type_tsv
    logger.info(
        "Reading transcript id to gene type mapping from %s",
        mapping_path,
    )
    transcript_to_gene_type = read_dict_from_tsv(mapping_path)

    logger.info("Calculating gene type counts for bam %s", args.input_bam)
    counts_by_gene_type = get_gene_type_counts(transcript_to_gene_type,
                                               args.input_bam)
    record.add(QCMetric("gene_type_count", counts_by_gene_type))

    logger.info("Writing QC output into %s", args.output_filename)
    with open(args.output_filename, "wt") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Write the number of detected genes from an abundance table as QC JSON.

    The tab-separated table at ``args.abundance`` is passed through
    ``remove_genomic_transcripts`` and ``filter_startswith_prefix`` (with
    ``args.idprefix``), then aggregated by gene on ``args.counts_colname``.
    Genes whose aggregated value is at least 1 are counted and the count is
    written to ``args.outfile``.
    """
    raw_table = pd.read_csv(args.abundance, sep="\t")
    without_genomic = remove_genomic_transcripts(raw_table)
    kept_rows = filter_startswith_prefix(without_genomic, args.idprefix)
    per_gene = calculate_abundances_aggregated_by_gene(kept_rows,
                                                       args.counts_colname)

    # Builtin sum over the boolean mask, exactly as the original computes it.
    detected = sum(per_gene >= 1)

    record = QCMetricRecord()
    record.add(
        QCMetric(
            "number_of_genes_detected",
            {"number_of_genes_detected": detected},
        ))

    with open(args.outfile, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Write the Spearman correlation between two replicates as QC JSON.

    Each replicate's tab-separated abundance table is passed through
    ``remove_genomic_transcripts`` and ``filter_startswith_prefix`` (with the
    replicate's own id prefix) and aggregated by gene on its last column.
    The two aggregated results are outer-aligned with missing entries filled
    with 0, correlated with the Spearman method, and the value is written to
    ``args.outfile`` under the ``replicates_correlation`` metric.
    """
    rep1_table = pd.read_csv(args.rep1_abundance, sep="\t")
    rep2_table = pd.read_csv(args.rep2_abundance, sep="\t")

    rep1_kept = filter_startswith_prefix(
        remove_genomic_transcripts(rep1_table), args.rep1_idprefix)
    rep2_kept = filter_startswith_prefix(
        remove_genomic_transcripts(rep2_table), args.rep2_idprefix)
    # Drop the references to the unfiltered tables once they are not needed.
    del rep1_table, rep2_table

    rep1_by_gene = calculate_abundances_aggregated_by_gene(
        rep1_kept, rep1_kept.columns[-1])
    rep2_by_gene = calculate_abundances_aggregated_by_gene(
        rep2_kept, rep2_kept.columns[-1])
    del rep1_kept, rep2_kept

    aligned_rep1, aligned_rep2 = rep1_by_gene.align(
        rep2_by_gene, join="outer", fill_value=0)
    rho = aligned_rep1.corr(aligned_rep2, method="spearman")

    record = QCMetricRecord(
        [QCMetric("replicates_correlation", {"spearman_correlation": rho})])
    with open(args.outfile, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def test_get_content():
    """``content`` compares equal to an OrderedDict with keys in ascending order."""
    unsorted_content = {2: "a", 1: "b"}
    expected = OrderedDict([(1, "b"), (2, "a")])
    metric = QCMetric("_", unsorted_content)
    assert metric.content == expected
def obj_a1():
    """Return a QCMetric named ``"a"`` with content ``{1: 2}``."""
    metric = QCMetric("a", {1: 2})
    return metric
def obj_a2():
    """Return a QCMetric named ``"a"`` with content ``{2: 3}``."""
    metric = QCMetric("a", {2: 3})
    return metric
def main(args):
    """Run the aligner, make the transcriptome bam RSEM-ready and write QC JSONs.

    Steps, in order:
      1. Pick the input fastqs. When a read has several files they are
         combined with ``concatenate_files``; a single file is used as-is.
         Read 2 is only considered when ``args.endedness == "paired"``.
      2. Extract the gzipped tar ``args.index`` into the current directory.
      3. Build an aligner with ``make_aligner`` and run it.
      4. Rename ``Aligned.sortedByCoord.out.bam`` and ``Log.final.out`` to
         ``<bamroot>_genome.bam`` and ``<bamroot>_Log.final.out``.
      5. Validate ``Aligned.toTranscriptome.out.bam`` with
         ``rsem-sam-validator``. A valid bam is renamed to
         ``<bamroot>_anno.bam``; otherwise ``convert-sam-for-rsem`` is run
         with ``<bamroot>_anno`` as its output argument.
      6. Produce flagstat files for both bams, parse them and the star log,
         and write each as a JSON file next to its source file.

    Args:
        args: parsed command line namespace providing ``fastqs_R1``,
            ``fastqs_R2``, ``endedness``, ``index``, ``indexdir``, ``ncpus``,
            ``ramGB`` and ``bamroot``.

    Raises:
        subprocess.CalledProcessError: if ``rsem-sam-validator`` or
            ``convert-sam-for-rsem`` exits with a non-zero status.
    """
    if len(args.fastqs_R1) > 1:
        merged_R1 = concatenate_files(args.fastqs_R1)
    else:
        merged_R1 = args.fastqs_R1[0]

    # Read 2 is only looked at for paired-end runs; it stays None otherwise
    # (and also when a paired run supplies no read 2 files).
    merged_R2 = None
    if args.endedness == "paired":
        if len(args.fastqs_R2) > 1:
            merged_R2 = concatenate_files(args.fastqs_R2)
        elif len(args.fastqs_R2) == 1:
            merged_R2 = args.fastqs_R2[0]

    fastqs = [merged_R1]
    if merged_R2:
        fastqs.append(merged_R2)

    # NOTE(review): extractall() trusts the member paths stored in the
    # archive - confirm the index tarball always comes from a trusted source.
    with tarfile.open(args.index, "r:gz") as archive:
        archive.extractall()

    aligner = make_aligner(args.endedness, fastqs, args.ncpus, args.ramGB,
                           args.indexdir)
    aligner.run()

    cwd = os.getcwd()
    genome_bam_path = os.path.join(cwd, args.bamroot + "_genome.bam")
    anno_bam_path = os.path.join(cwd, args.bamroot + "_anno.bam")
    genome_flagstat_path = os.path.join(cwd,
                                        args.bamroot + "_genome_flagstat.txt")
    anno_flagstat_path = os.path.join(cwd,
                                      args.bamroot + "_anno_flagstat.txt")
    star_log_path = os.path.join(cwd, args.bamroot + "_Log.final.out")

    os.rename(os.path.join(cwd, "Aligned.sortedByCoord.out.bam"),
              genome_bam_path)
    os.rename(os.path.join(cwd, "Log.final.out"), star_log_path)

    transcriptome_bam = "Aligned.toTranscriptome.out.bam"
    rsem_output = subprocess.check_output(
        ["rsem-sam-validator", transcriptome_bam])
    # rsem validator exits with 0 whether the check passes or not
    # for this reason we check if the output ends in 'is valid!'
    # the other possibility is 'is not valid!'
    rsem_valid = rsem_output.decode().strip().split("\n")[-1].endswith(
        "is valid!")

    if rsem_valid:
        logger.info("Transcriptome bam is already rsem-sorted.")
        os.rename(os.path.join(cwd, transcriptome_bam), anno_bam_path)
    else:
        logger.info("Transcriptome bam is not rsem-sorted.")
        # Argument list instead of a formatted string run through
        # shlex.split, so a bamroot containing whitespace stays one argument.
        rsem_sort_cmd = [
            "convert-sam-for-rsem",
            transcriptome_bam,
            args.bamroot + "_anno",
        ]
        logger.info("Running %s", " ".join(rsem_sort_cmd))
        # Stop here if the conversion fails; the steps below need its output.
        subprocess.check_call(rsem_sort_cmd)

    get_flagstats(genome_bam_path, genome_flagstat_path)
    get_flagstats(anno_bam_path, anno_flagstat_path)
    anno_flagstat_content = parse_flagstats(anno_flagstat_path)
    genome_flagstat_content = parse_flagstats(genome_flagstat_path)
    star_log_content = parse_starlog(star_log_path)

    anno_flagstat_qc = QCMetric("samtools_anno_flagstat",
                                anno_flagstat_content)
    genome_flagstat_qc = QCMetric("samtools_genome_flagstat",
                                  genome_flagstat_content)
    star_log_qc = QCMetric("star_log_qc", star_log_content)

    # Each JSON is written next to its source file, with the extension swapped.
    write_json(
        anno_flagstat_qc.to_ordered_dict(),
        re.sub(r"\.txt$", ".json", anno_flagstat_path),
    )
    write_json(
        genome_flagstat_qc.to_ordered_dict(),
        re.sub(r"\.txt$", ".json", genome_flagstat_path),
    )
    write_json(star_log_qc.to_ordered_dict(),
               re.sub(r"\.out$", ".json", star_log_path))
def test_equals():
    """Two metrics with the same name compare equal even if their content differs."""
    empty_metric = QCMetric("a", {})
    populated_metric = QCMetric("a", {"x": "y"})
    assert empty_metric == populated_metric
def test_QCMetric_repr():
    """``__repr__`` shows the name and the content as an OrderedDict."""
    expected = "QCMetric('a', OrderedDict([(1, 'x')]))"
    metric = QCMetric("a", {1: "x"})
    actual = metric.__repr__()
    assert actual == expected
def test_len_0():
    """A metric built from an empty dict has length 0."""
    empty_metric = QCMetric("a", {})
    assert len(empty_metric) == 0
def test_less_than():
    """A metric named 1 orders before a metric named 2."""
    low, high = QCMetric(1, {}), QCMetric(2, {})
    assert low < high
def obj_b():
    """Return a QCMetric named ``"b"`` with content ``{3: 4}``."""
    metric = QCMetric("b", {3: 4})
    return metric
def test_get_name():
    """``name`` returns the name the metric was constructed with."""
    expected_name = "a"
    metric = QCMetric(expected_name, {})
    assert metric.name == expected_name
def test_type_check():
    """Passing an integer as the content raises TypeError."""
    invalid_content = 1
    with pytest.raises(TypeError):
        QCMetric("name", invalid_content)
def obj_d():
    """Return a QCMetric named ``"d"`` with content ``{"a": "b"}``."""
    metric = QCMetric("d", {"a": "b"})
    return metric
def obj_c():
    """Return a QCMetric named ``"c"`` with content ``{1: 2, 3: 4, 5: 6}``."""
    content = {1: 2, 3: 4, 5: 6}
    return QCMetric("c", content)