import json
import logging
import os

import numpy as np
import pysam
from scipy.stats import gamma

import matplotlib
matplotlib.use('Agg')  # assumption: figures are only saved to files, so a non-interactive backend suffices
import matplotlib.pyplot as plt

import lq_gamma

# The remaining helpers (get_sts_xml_path, parse_sts_xml, get_sts_csv_path,
# load_sts_csv, get_N50, get_NXX, rgb, gen_boxplot_length_vs_score, get_bam_path,
# get_readtype, set_scrap, set_subreads, construct_polread) are assumed to be
# importable from the package's utility modules.


def run_platformqc(data_path, output_path, *, suffix=None, b_width=1000):
    """Platform QC for PacBio RS-II runs.

    Parses sts.xml and the sts csv, fits a gamma distribution to HQ-region
    (polymerase read) lengths, and writes QC metrics to JSON plus length and
    score figures to PNG.
    """
    if not suffix:
        suffix = ""
    else:
        suffix = "_" + suffix
    log_path  = os.path.join(output_path, "log", "log_rs2_platformqc" + suffix + ".txt")
    fig_path  = os.path.join(output_path, "fig", "fig_rs2_platformqc_length" + suffix + ".png")
    fig_path2 = os.path.join(output_path, "fig", "fig_rs2_platformqc_score" + suffix + ".png")
    json_path = os.path.join(output_path, "QC_vals_rs" + suffix + ".json")

    tobe_json = {}  # accumulates the QC values written to json_path

    # output_path (and its log/fig subdirectories) will be made too.
    os.makedirs(os.path.join(output_path, "log"), exist_ok=True)
    os.makedirs(os.path.join(output_path, "fig"), exist_ok=True)

    ### logging conf ###
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    fh = logging.FileHandler(log_path, 'w')
    sh = logging.StreamHandler()

    formatter = logging.Formatter('%(module)s:%(asctime)s:%(lineno)d:%(levelname)s:%(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)

    logger.addHandler(sh)
    logger.addHandler(fh)
    #####################

    logger.info("Started RS-II platform QC for %s" % data_path)

    xml_file = get_sts_xml_path(data_path, logger)
    if not xml_file:
        logger.warning("sts.xml is missing. Productivity won't be shown.")
        [p0, p1, p2] = [None] * 3
    else:
        [p0, p1, p2] = parse_sts_xml(
            xml_file, ns="http://pacificbiosciences.com/PipelineStats/PipeStats.xsd")
        logger.info("Parsed sts.xml.")

    csv_path = get_sts_csv_path(data_path, logger)
    if not csv_path:
        logger.error("Platform QC failed due to missing csv files.")  # Logger has no ERROR attribute; use error()
        return 1

    df = load_sts_csv(csv_path)
    logger.info("Stat file was loaded.")

    # Keep only reads passing the read-score filter, and compute HQ-region lengths.
    passed = df['ReadScore'] > 0.1
    vals = df['HQRegionEnd'].values[passed] - df['HQRegionStart'].values[passed]

    (a, b) = lq_gamma.estimate_gamma_dist_scipy(vals, logger)
    logger.info("Fitting by Gamma dist finished.")

    _max  = np.array(vals).max()
    _mean = np.array(vals).mean()
    _n50  = get_N50(vals)
    _n90  = get_NXX(vals, 90)
    throughput = np.sum(vals)

    # HQ fraction over NumBases
    fracs = vals / df['NumBases'].values[passed]

    tobe_json["Productivity"]         = {"P0": p0, "P1": p1, "P2": p2}
    tobe_json["Throughput"]           = int(throughput)
    tobe_json["Longest_read"]         = int(_max)
    tobe_json["Num_of_reads"]         = len(vals)
    tobe_json["polread_gamma_params"] = [float(a), float(b)]
    tobe_json["Mean_polread_length"]  = float(_mean)
    tobe_json["N50_polread_length"]   = float(_n50)
    tobe_json["Mean_HQ_fraction"]     = float(np.mean(fracs))

    with open(json_path, "w") as f:
        logger.info("Quality measurements were written into a JSON file: %s" % json_path)
        json.dump(tobe_json, f, indent=4)

    # Overlay the fitted gamma density on a histogram of HQ-region lengths.
    x = np.linspace(0, gamma.ppf(0.99, a, 0, b))
    est_dist = gamma(a, 0, b)
    plt.plot(x, est_dist.pdf(x), c=rgb(214, 39, 40))
    plt.grid(True)
    plt.hist(vals, histtype='step',
             bins=np.arange(min(vals), _max + b_width, b_width),
             color=rgb(214, 39, 40), alpha=0.7, density=True)
    plt.xlabel('Read length')
    plt.ylabel('Probability density')

    # Pol read mean is expected to be >= 10k and <= 15k, but omit the <= 15k condition.
    if _mean >= 10000:
        plt.axvline(x=_mean, linestyle='dashed', linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_mean, linestyle='dashed', linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

    # A recent brochure says 20 kb, but some old announcements say 14 kb; let's see.
    if _n50 >= 20000:
        plt.axvline(x=_n50, linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_n50, linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

    # Histogram of raw polymerase read lengths (NumBases) for comparison.
    vals = df['NumBases'].values[passed]
    plt.hist(vals, histtype='step',
             bins=np.arange(min(vals), max(vals) + b_width, b_width),
             color=rgb(31, 119, 180), alpha=0.7, density=True)

    ymin, ymax = plt.gca().get_ylim()
    xmin, xmax = plt.gca().get_xlim()
    plt.text(xmax * 0.6, ymax * 0.72, r'$\alpha=%.3f,\ \beta=%.3f$' % (a, b))
    plt.text(xmax * 0.6, ymax * 0.77, r'Gamma dist params:')
    plt.text(xmax * 0.6, ymax * 0.85, r'sample mean: %.3f' % (_mean,))
    plt.text(xmax * 0.6, ymax * 0.90, r'N50: %.3f' % (_n50,))
    plt.text(xmax * 0.6, ymax * 0.95, r'N90: %.3f' % (_n90,))
    plt.text(_mean, ymax * 0.85, r'Mean')
    plt.text(_n50,  ymax * 0.90, r'N50')

    plt.savefig(fig_path, bbox_inches="tight")
    plt.close()

    # Read score after size binning.
    subplot = gen_boxplot_length_vs_score(df, b_width)
    xmin, xmax = plt.gca().get_xlim()
    plt.title("Read scores over different length reads")
    plt.xticks(np.arange(xmax + 1), [int(i) for i in np.arange(xmax + 1) * b_width])
    plt.suptitle("")
    plt.savefig(fig_path2, bbox_inches="tight")
    plt.close()

    logger.info("Figs were generated.")
    logger.info("Finished all processes.")
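
# For reference, a minimal sketch of the NXX statistic that get_N50/get_NXX are
# assumed to compute (the package ships its own implementations; _nxx_sketch is
# a hypothetical name used for illustration only): NXX is the largest length L
# such that reads of length >= L together contain at least XX% of all bases.
def _nxx_sketch(lengths, xx=50):
    total = sum(lengths)
    acc = 0
    for l in sorted(lengths, reverse=True):  # walk from the longest read down
        acc += l
        if acc >= total * xx / 100.0:
            return l  # first length where the cumulative sum crosses XX% of bases
    return 0

# e.g. _nxx_sketch(vals) approximates get_N50(vals), and _nxx_sketch(vals, 90)
# approximates get_NXX(vals, 90).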
# Sequel variant. This presumably lives in a separate module from the RS-II
# variant above, since the second definition of run_platformqc would otherwise
# shadow the first.
def run_platformqc(data_path, output_path, *, suffix=None, b_width=1000):
    """Platform QC for PacBio Sequel runs.

    Parses sts.xml, loads scraps.bam and subreads.bam, reconstructs polymerase
    reads, fits a gamma distribution to HQ-region lengths, and writes QC
    metrics to JSON plus length and adapter-count figures to PNG.
    """
    if not suffix:
        suffix = ""
    else:
        suffix = "_" + suffix
    log_path     = os.path.join(output_path, "log", "log_sequel_platformqc" + suffix + ".txt")
    fig_path     = os.path.join(output_path, "fig", "fig_sequel_platformqc_length" + suffix + ".png")
    fig_path_bar = os.path.join(output_path, "fig", "fig_sequel_platformqc_adapter" + suffix + ".png")
    json_path    = os.path.join(output_path, "QC_vals_sequel" + suffix + ".json")

    tobe_json = {}  # accumulates the QC values written to json_path

    # output_path (and its log/fig subdirectories) will be made too.
    os.makedirs(os.path.join(output_path, "log"), exist_ok=True)
    os.makedirs(os.path.join(output_path, "fig"), exist_ok=True)

    ### logging conf ###
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    fh = logging.FileHandler(log_path, 'w')
    sh = logging.StreamHandler()

    formatter = logging.Formatter('%(module)s:%(asctime)s:%(lineno)d:%(levelname)s:%(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)

    logger.addHandler(sh)
    logger.addHandler(fh)
    #####################

    logger.info("Started Sequel platform QC for %s" % data_path)

    xml_file = get_sts_xml_path(data_path, logger)
    if not xml_file:
        logger.warning("sts.xml is missing. Productivity won't be shown.")
        [p0, p1, p2] = [None] * 3
    else:
        [p0, p1, p2] = parse_sts_xml(
            xml_file, ns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd")
        logger.info("Parsed sts.xml.")

    [subr_bam_p, scrap_bam_p] = get_bam_path(data_path, logger)
    if subr_bam_p and scrap_bam_p:
        scrap_bam = pysam.AlignmentFile(scrap_bam_p, 'rb', check_sq=False)
        subr_bam  = pysam.AlignmentFile(subr_bam_p, 'rb', check_sq=False)
    else:
        logger.error("Platform QC failed due to missing bam files.")  # Logger has no ERROR attribute; use error()
        return 1

    bam_reads   = {}
    snr         = [[], [], [], []]  # per-channel SNR values, filled by set_scrap/set_subreads
    hr_fraction = []
    tot_lengths = []
    hr_lengths  = []
    ad_num_stat = {}
    control_throughput = 0

    if get_readtype(scrap_bam.header) == 'SCRAP':
        logger.info("Started to load scraps.bam...")
        control_throughput = set_scrap(bam_reads, scrap_bam, snr)
    else:
        logger.error("The given scrap file has an incorrect header.")
    logger.info("Scrap reads were loaded.")

    if get_readtype(subr_bam.header) == 'SUBREAD':
        logger.info("Started to load subreads.bam...")
        set_subreads(bam_reads, subr_bam, snr)
    else:
        logger.error("The given subread file has an incorrect header.")
    logger.info("Subreads were loaded.")

    # Reconstruct polymerase reads, collecting HQ-region stats and adapter counts.
    for k, v in bam_reads.items():
        l = construct_polread(v)
        if l[4]:  # the reconstructed read has a valid HQ region
            hr_fraction.append(l[2] / l[3])
            tot_lengths.append(l[3])
            hr_lengths.append(l[2])
            ad_num_stat[l[5]] = ad_num_stat.get(l[5], 0) + 1

    # Bar plot of how many reads carry each number of adapters.
    max_adnum = max(ad_num_stat.keys())
    min_adnum = min(ad_num_stat.keys())
    left   = list(range(min_adnum, max_adnum + 1))
    height = [ad_num_stat.get(i, 0) for i in left]

    plt.bar(left, height)
    plt.savefig(fig_path_bar, bbox_inches="tight")
    plt.close()
    logger.info("Plotted bar plot for adapter occurrence.")

    (a, b) = lq_gamma.estimate_gamma_dist_scipy(hr_lengths)
    logger.info("Fitting by Gamma dist finished.")

    _max  = np.array(hr_lengths).max()
    _mean = np.array(hr_lengths).mean()
    _n50  = get_N50(hr_lengths)
    _n90  = get_NXX(hr_lengths, 90)
    throughput = np.sum(hr_lengths)
    mean_hq_fraction = np.mean(hr_fraction)

    tobe_json["Productivity"]         = {"P0": p0, "P1": p1, "P2": p2}
    tobe_json["Throughput"]           = int(throughput)
    tobe_json["Throughput(Control)"]  = int(control_throughput)
    tobe_json["Longest_read"]         = int(_max)
    tobe_json["Num_of_reads"]         = len(hr_lengths)
    tobe_json["polread_gamma_params"] = [float(a), float(b)]
    tobe_json["Mean_polread_length"]  = float(_mean)
    tobe_json["N50_polread_length"]   = float(_n50)
    tobe_json["Mean_HQ_fraction"]     = float(mean_hq_fraction)
    tobe_json["Adapter_observation"]  = ad_num_stat

    with open(json_path, "w") as f:
        logger.info("Quality measurements were written into a JSON file: %s" % json_path)
        json.dump(tobe_json, f, indent=4)

    # Overlay the fitted gamma density on a histogram of HQ-region lengths.
    x = np.linspace(0, gamma.ppf(0.99, a, 0, b))
    est_dist = gamma(a, 0, b)
    plt.plot(x, est_dist.pdf(x), c=rgb(214, 39, 40))
    plt.grid(True)
    plt.hist(hr_lengths, histtype='step',
             bins=np.arange(min(hr_lengths), _max + b_width, b_width),
             color=rgb(214, 39, 40), alpha=0.7, density=True)  # `normed` was removed in Matplotlib 3.x
    plt.xlabel('Read length')
    plt.ylabel('Probability density')

    # Pol read mean is expected to be >= 10k and <= 15k, but omit the <= 15k condition.
    if _mean >= 10000:
        plt.axvline(x=_mean, linestyle='dashed', linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_mean, linestyle='dashed', linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

    if _n50 >= 20000:
        plt.axvline(x=_n50, linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_n50, linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

    # Histogram of total polymerase read lengths for comparison.
    plt.hist(tot_lengths, histtype='step',
             bins=np.arange(min(tot_lengths), max(tot_lengths) + b_width, b_width),
             color=rgb(31, 119, 180), alpha=0.7, density=True)

    ymin, ymax = plt.gca().get_ylim()
    xmin, xmax = plt.gca().get_xlim()
    plt.text(xmax * 0.6, ymax * 0.72, r'$\alpha=%.3f,\ \beta=%.3f$' % (a, b))
    plt.text(xmax * 0.6, ymax * 0.77, r'Gamma dist params:')
    plt.text(xmax * 0.6, ymax * 0.85, r'sample mean: %.3f' % (_mean,))
    plt.text(xmax * 0.6, ymax * 0.90, r'N50: %.3f' % (_n50,))
    plt.text(xmax * 0.6, ymax * 0.95, r'N90: %.3f' % (_n90,))
    plt.text(_mean, ymax * 0.85, r'Mean')
    plt.text(_n50,  ymax * 0.90, r'N50')

    plt.savefig(fig_path, bbox_inches="tight")
    plt.close()

    logger.info("Figs were generated.")
    logger.info("Finished all processes.")