from crimson import picard


def process_aln_stats(path):
    # crimson exposes Picard AlignmentSummaryMetrics rows as dicts; keep the
    # PAIR row, which aggregates both mates.
    parsed = picard.parse(path)
    raw = next(x for x in parsed["metrics"]["contents"]
               if x["CATEGORY"] == "PAIR")
    aln_proper_pairs = (raw["READS_ALIGNED_IN_PAIRS"] -
                        raw["PF_READS_IMPROPER_PAIRS"])
    return {
        "num_aligned_bases": raw["PF_ALIGNED_BASES"],
        "num_aligned_reads": raw["PF_READS_ALIGNED"],
        "num_aligned_reads_proper_pairs": aln_proper_pairs,
        "num_total_reads": raw["TOTAL_READS"],
        "pct_adapter": raw["PCT_ADAPTER"],
        "pct_aligned_reads_from_total": (raw["PF_READS_ALIGNED"] * 100. /
                                         raw["TOTAL_READS"]),
        "pct_aligned_reads_proper_pairs": (aln_proper_pairs * 100. /
                                           raw["PF_READS_ALIGNED"]),
        "pct_chimeras": raw["PCT_CHIMERAS"],
        "rate_indel": raw["PF_INDEL_RATE"],
        "rate_mismatch": raw["PF_MISMATCH_RATE"],
        "strand_balance": raw["STRAND_BALANCE"],
    }
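# A minimal usage sketch for process_aln_stats; the path below is hypothetical
# and should point at a Picard CollectAlignmentSummaryMetrics output file.
stats = process_aln_stats("sample.alignment_summary_metrics.txt")
print(stats["pct_aligned_reads_from_total"])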
import os

import pandas as pd
from crimson import picard


def write_aggregated_picard_metrics_by_row(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write Picard row metrics.

    Parameters
    ----------
    file_names: array of files. The basename of each input should be formatted
        as 'samplename_qc', such as "samplename_qc.alignment_summary_metrics.txt"
        and "samplename_qc.insert_size_metrics.txt".
    output_name: prefix of the output file name, without extension.

    Returns
    -------
    0 if the program completes successfully.
    """
    # initial output
    metrics = {}
    d = pd.DataFrame()
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split('_qc')[0]
        metrics[cell_id] = {}
        parsed = picard.parse(file_name)
        class_name = parsed['metrics']['class'].split('.')[2]
        contents = parsed['metrics']['contents']
        if class_name == "AlignmentSummaryMetrics":
            # Parse out PE, R1 and R2. If the reads are unpaired, the contents
            # will be a single dict rather than a list of dicts.
            if isinstance(contents, dict):
                contents = [contents]
            rows = {}
            for m in contents:
                cat = m['CATEGORY']
                rows.update({
                    k + '.' + cat: v for k, v in m.items()
                    if k not in ['SAMPLE', 'LIBRARY', 'READ_GROUP', 'CATEGORY']
                })
        elif class_name == "InsertSizeMetrics":
            # Insert size metrics occasionally (very rarely) return multiple
            # rows to include TANDEM repeats; fewer than 21 elements means
            # multiple rows were returned, so keep only the first.
            if len(contents) < 21:
                rows = contents[0]
            else:
                rows = contents
        else:
            # Other metrics (so far) return a single row.
            rows = contents
        metrics[cell_id].update({
            k: rows[k] for k in rows
            if k not in ['SAMPLE', 'LIBRARY', 'READ_GROUP', 'CATEGORY']
        })
        df = pd.DataFrame.from_dict(metrics, orient='columns')
        df.insert(0, 'Class', class_name)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        d = pd.concat([d, df])
    d_T = d.T
    d_T.to_csv(output_name + '.csv')
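# Hypothetical invocation of the row aggregator above: per-cell Picard outputs
# whose basenames share the "<cell>_qc" prefix are merged into
# "qc_metrics.csv", with cells as rows and metrics as columns after the
# final transpose.
write_aggregated_picard_metrics_by_row(
    ["cellA_qc.alignment_summary_metrics.txt",
     "cellA_qc.insert_size_metrics.txt"],
    "qc_metrics",
)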
from crimson import picard


def process_insert_stats(path):
    parsed = picard.parse(path)
    raw = parsed["metrics"]["contents"]
    # raw is a list if there is more than one PAIR_ORIENTATION; keep the
    # FR (forward-reverse) row.
    if isinstance(raw, list):
        raw = next(r for r in raw if r["PAIR_ORIENTATION"] == "FR")
    return {
        "median_absolute_deviation": raw["MEDIAN_ABSOLUTE_DEVIATION"],
        "median_insert_size": raw["MEDIAN_INSERT_SIZE"],
        "min_insert_size": raw["MIN_INSERT_SIZE"],
        "max_insert_size": raw["MAX_INSERT_SIZE"],
    }
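# Usage sketch (hypothetical path): Picard CollectInsertSizeMetrics emits one
# row per observed PAIR_ORIENTATION, so process_insert_stats keeps the FR row
# when several are present.
insert_stats = process_insert_stats("sample.insert_size_metrics.txt")
print(insert_stats["median_insert_size"])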
import os
import os.path as op

import numpy as np


def hisat_check(dirw, ilist, olist, diro1, diro2, paired):
    from crimson import picard
    os.chdir(dirw)
    assert op.isfile(ilist), "%s does not exist" % ilist
    ary = np.genfromtxt(ilist, names=True, dtype=object, delimiter="\t")
    cols = list(ary.dtype.names)
    fho = open(olist, "w")
    if paired:
        fho.write("\t".join(cols + ["BAM", "Pair", "Pair_Map", "Pair_Orphan",
                                    "Pair_Unmap", "Pair_Map_Hq", "Unpair",
                                    "Unpair_Map", "Unpair_Map_Hq"]) + "\n")
    else:
        fho.write("\t".join(cols + ["BAM", "Total", "Mapped", "Mapped_Hq"]) + "\n")
    for row in ary:
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        bam = "%s/%s.bam" % (diro1, sid)
        assert op.isfile(bam), "%s does not exist" % bam
        fs = "%s/%s.sum.txt" % (diro2, sid)
        rs1 = picard.parse(fs)['metrics']['contents']
        if isinstance(rs1, dict):
            rs1 = [rs1]
        # Re-key the metric rows by CATEGORY for direct access.
        rs = {r['CATEGORY']: r for r in rs1}
        if paired:
            f1r, f2r, rc, f1p, f1u, f2p, f2u, rrc, rc1, rc2 = row[1:11]
            pair = rs['FIRST_OF_PAIR']['TOTAL_READS']
            pair_map = rs['FIRST_OF_PAIR']['READS_ALIGNED_IN_PAIRS']
            pair_map1 = rs['FIRST_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq1 = rs['FIRST_OF_PAIR']['PF_HQ_ALIGNED_READS']
            pair_map2 = rs['SECOND_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq2 = rs['SECOND_OF_PAIR']['PF_HQ_ALIGNED_READS']
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            pair_orphan = pair_map1 + pair_map2 - pair_map * 2
            pair_unmap = pair - pair_map - pair_orphan
            pair_map_hq = int((pair_map_hq1 + pair_map_hq2) / 2)
            assert pair == int(rrc), "error 1: %d %s" % (pair, rrc)
            assert int(rc1) + int(rc2) == unpair, "error 2"
            stats = map(str, [pair, pair_map, pair_orphan, pair_unmap,
                              pair_map_hq, unpair, unpair_map, unpair_map_hq])
        else:
            fr, rc, ft, rrc = row[1:5]
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            stats = map(str, [unpair, unpair_map, unpair_map_hq])
        fho.write("\t".join(row + [bam] + list(stats)) + "\n")
    fho.close()
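# The CATEGORY -> row mapping used by hisat_check, shown standalone with a
# hypothetical path: crimson returns AlignmentSummaryMetrics rows as a list of
# dicts (or a single dict when only one category is present), and re-keying by
# CATEGORY gives direct access to FIRST_OF_PAIR / SECOND_OF_PAIR / UNPAIRED.
from crimson import picard

rows = picard.parse("sample.sum.txt")["metrics"]["contents"]
if isinstance(rows, dict):
    rows = [rows]
by_category = {r["CATEGORY"]: r for r in rows}
print(sorted(by_category))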
from crimson import picard


def process_rna_stats(path):
    parsed = picard.parse(path)
    raw = parsed["metrics"]["contents"]
    cov_sort_key = lambda x: x["normalized_position"]
    return {
        "median_3prime_bias": raw["MEDIAN_3PRIME_BIAS"],
        "median_5prime_bias": raw["MEDIAN_5PRIME_BIAS"],
        "median_5prime_to_3prime_bias": raw["MEDIAN_5PRIME_TO_3PRIME_BIAS"],
        "median_cv_coverage": raw["MEDIAN_CV_COVERAGE"],
        "num_coding_bases": raw["CODING_BASES"],
        "num_intergenic_bases": raw["INTERGENIC_BASES"],
        "num_intronic_bases": raw["INTRONIC_BASES"],
        "num_mrna_bases": raw["CODING_BASES"] + raw["UTR_BASES"],
        # RIBOSOMAL_BASES is empty when no ribosomal interval list was given.
        "num_ribosomal_bases": (raw["RIBOSOMAL_BASES"]
                                if raw["RIBOSOMAL_BASES"] != "" else None),
        "num_total_bases": raw["PF_BASES"],
        "num_utr_bases": raw["UTR_BASES"],
        "pct_coding_bases": raw["PCT_CODING_BASES"],
        "pct_intergenic_bases": raw["PCT_INTERGENIC_BASES"],
        "pct_intronic_bases": raw["PCT_INTRONIC_BASES"],
        "pct_mrna_bases": raw["PCT_MRNA_BASES"],
        "pct_ribosomal_bases": (raw["RIBOSOMAL_BASES"] * 100. /
                                raw["PF_ALIGNED_BASES"]
                                if raw["RIBOSOMAL_BASES"] != "" else None),
        "pct_utr_bases": raw["PCT_UTR_BASES"],
        "normalized_cov": [
            item["All_Reads.normalized_coverage"]
            for item in sorted(parsed["histogram"]["contents"],
                               key=cov_sort_key)
        ],
    }
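# Usage sketch (hypothetical path): process_rna_stats reads a Picard
# CollectRnaSeqMetrics file; the histogram section supplies the per-position
# normalized coverage curve returned under "normalized_cov".
rna_stats = process_rna_stats("sample.rna_metrics.txt")
print(rna_stats["pct_mrna_bases"], len(rna_stats["normalized_cov"]))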
import json
from os.path import basename

import pandas as pd
from crimson import picard
from google.cloud import storage


def merge_picard_metrics(files, metric_name):
    """The pipeline outputs Picard QC metrics at the single cell/sample level.
    This function is called to merge/aggregate QC metrics by metric type and
    then merge multiple QC measurements into a single matrix file, in which
    columns are samples/cells and rows are QC metrics.

    :param files: metric files from pipeline outputs
    :param metric_name: metric name prefixed with the workflow and subworkflow
        names, such as 'run_pipelines.RunStarPipeline.alignment_summary_metrics'
    """
    # set up auth
    client = storage.Client()
    bucket = client.get_bucket('broad-dsde-mint-dev-cromwell-execution')
    # load cromwell credential
    logins = json.load(open('/usr/secrets/broad-dsde-mint-dev-cromwell.json'))
    # initial output
    mets = {}
    for fc in files:
        fc = fc.replace('gs://broad-dsde-mint-dev-cromwell-execution/', '')
        blob = bucket.get_blob(fc)
        met_name = basename(fc)
        # the sample name is the prefix of the file name
        sample_name = met_name.split('.')[0]
        with open(met_name, 'wb') as file_obj:
            blob.download_to_file(file_obj)
        # the crimson picard parser returns the metrics as a dict
        parsed = picard.parse(met_name)
        class_name = parsed['metrics']['class']
        if class_name == "picard.analysis.AlignmentSummaryMetrics":
            # alignment metrics return multiple rows;
            # only output the paired-reads (third) row
            met = parsed['metrics']['contents'][2]
        elif class_name == "picard.analysis.InsertSizeMetrics":
            # Insert size metrics occasionally (very rarely) return multiple
            # rows to include TANDEM repeats; fewer than 21 elements means
            # multiple rows were returned, so keep only the first.
            if len(parsed['metrics']['contents']) < 21:
                met = parsed['metrics']['contents'][0]
            else:
                met = parsed['metrics']['contents']
        else:
            # other metrics (so far) return a single row
            met = parsed['metrics']['contents']
        mets[sample_name] = met
    merged = pd.DataFrame.from_dict(mets)
    return merged
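# Hypothetical call to merge_picard_metrics; the gs:// URIs are placeholders
# and the function assumes credentials for the hard-coded bucket above.
merged = merge_picard_metrics(
    ["gs://broad-dsde-mint-dev-cromwell-execution/sampleA.alignment_summary_metrics.txt",
     "gs://broad-dsde-mint-dev-cromwell-execution/sampleB.alignment_summary_metrics.txt"],
    "run_pipelines.RunStarPipeline.alignment_summary_metrics",
)
merged.to_csv("alignment_summary_metrics.csv")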
def add(self, sample_name, input_dir, internal_sample_name):
    metric_line = dict()
    self.add_to_metric_line(metric_line, 'SAMPLE_NAME', sample_name)
    self.add_to_metric_line(metric_line, 'INTERNAL_NAME', internal_sample_name)

    # samtools flagstat counts
    flagstat_metrics = flagstat.parse(input_dir.flagstat_file())
    self.add_flagstat_columns(metric_line, flagstat_metrics)

    # Picard metrics, parsed with crimson
    alignment_metrics = picard.parse(
        input_dir.picard_alignment_metrics_file())
    self.add_picard_alignment_columns(metric_line, alignment_metrics)

    dup_metrics = picard.parse(
        input_dir.picard_mark_duplicates_metrics_file())
    self.add_picard_markdup_columns(metric_line, dup_metrics)

    ins_metrics = picard.parse(input_dir.picard_insert_size_metrics_file())
    self.add_generic_picard_columns(metric_line, ins_metrics, 'INS')

    wgs_metrics = picard.parse(input_dir.picard_wgs_metrics_file())
    self.add_generic_picard_columns(metric_line, wgs_metrics)

    gc_metrics = picard.parse(input_dir.picard_gc_bias_metrics_file())
    self.add_generic_picard_columns(metric_line, gc_metrics)

    all_vc_metrics = picard.parse(input_dir.all_chrom_vc_detail_metrics())
    self.add_generic_picard_columns(metric_line, all_vc_metrics, 'ALL')

    x_vc_metrics = picard.parse(input_dir.X_chrom_vc_detail_metrics())
    self.add_generic_picard_columns(metric_line, x_vc_metrics, 'CHRX')

    # contamination estimate (freemix) from VerifyBamID
    verifybamid_metrics = verifybamid.parse(
        input_dir.verifybamid_self_sample_file())
    self.add_freemix_column(metric_line, verifybamid_metrics)

    # derived metrics computed from the columns gathered above
    self.add_to_metric_line(metric_line, 'HAPLOID_COVERAGE',
                            self.haploid_coverage(metric_line))
    self.add_to_metric_line(metric_line, 'interchromosomal_rate',
                            self.interchromosomal_rate(metric_line))
    self.add_to_metric_line(metric_line, 'discordant_rate',
                            self.discordant_rate(metric_line))

    self.lines.append('\t'.join(
        [str(metric_line[key]) for key in self.header_order]))
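# Standalone sketch of the crimson parsers the method above combines; the
# paths are hypothetical. flagstat.parse returns a dict with "pass_qc" and
# "fail_qc" sections, while picard.parse returns a dict with a "metrics"
# section (plus "histogram" for tools that emit one).
from crimson import flagstat, picard

fs = flagstat.parse("sample.flagstat.txt")
print(fs["pass_qc"]["total"])
aln = picard.parse("sample.alignment_summary_metrics.txt")
print(aln["metrics"]["class"])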
import os

import pandas as pd
from crimson import picard


def write_aggregated_picard_metrics_by_table(file_names, output_name):
    """Command line entrypoint to parse and write Picard table metrics.

    Parameters
    ----------
    file_names: array of files. The basename of each input should be formatted
        as 'samplename_qc'.
    output_name: prefix of the output file name. The basename of each output
        includes the Picard metrics class name.

    Returns
    -------
    0 if the program completes successfully.
    """
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split('_qc')[0]
        class_name = os.path.basename(file_name).split('.')[1]
        parsed = picard.parse(file_name)
        dat = pd.DataFrame.from_dict(parsed['metrics']['contents'])
        dat.insert(0, 'Sample', cell_id)
        dat.to_csv(output_name + "_" + class_name + '.csv', index=False)
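# Hypothetical invocation of the table aggregator above: with an input named
# "cellA_qc.error_summary_metrics.txt", the class name parsed from the
# basename yields an output file "qc_table_error_summary_metrics.csv".
write_aggregated_picard_metrics_by_table(
    ["cellA_qc.error_summary_metrics.txt"], "qc_table")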
from crimson import picard


def get_picard_stat(file):
    # crimson returns a dict for single-row metrics and a list of dicts for
    # multi-row metrics; in the latter case keep the last row.
    all_metr = picard.parse(file)['metrics']['contents']
    if isinstance(all_metr, dict):
        return all_metr
    return all_metr[-1]
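# Usage sketch (hypothetical path): for multi-row outputs such as
# AlignmentSummaryMetrics, get_picard_stat returns the final row, which for
# paired data is the aggregated PAIR category.
stat = get_picard_stat("sample.alignment_summary_metrics.txt")
print(stat.get("CATEGORY"), stat.get("TOTAL_READS"))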