Example #1
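# context assumed but not shown by the listing: imports of os, json, logging,
# numpy as np, matplotlib.pyplot as plt, scipy.stats.gamma, and module-local
# helpers (lq_gamma, get_sts_xml_path, parse_sts_xml, get_sts_csv_path,
# load_sts_csv, get_N50, get_NXX, rgb, gen_boxplot_length_vs_score)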
def run_platformqc(data_path, output_path, *, suffix=None, b_width=1000):
    if not suffix:
        suffix = ""
    else:
        suffix = "_" + suffix
    log_path = os.path.join(output_path, "log",
                            "log_rs2_platformqc" + suffix + ".txt")
    fig_path = os.path.join(output_path, "fig",
                            "fig_rs2_platformqc_length" + suffix + ".png")
    fig_path2 = os.path.join(output_path, "fig",
                             "fig_rs2_platformqc_score" + suffix + ".png")
    json_path = os.path.join(output_path, "QC_vals_rs" + suffix + ".json")

    # values to be written out as the QC JSON report
    tobe_json = {}

    # create the log/fig subdirectories (output_path is created as well)
    os.makedirs(os.path.join(output_path, "log"), exist_ok=True)
    os.makedirs(os.path.join(output_path, "fig"), exist_ok=True)

    ### logging conf ###
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(log_path, 'w')
    sh = logging.StreamHandler()

    formatter = logging.Formatter(
        '%(module)s:%(asctime)s:%(lineno)d:%(levelname)s:%(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)

    logger.addHandler(sh)
    logger.addHandler(fh)
    #####################

    logger.info("Started RS-II platform QC for %s" % data_path)

    xml_file = get_sts_xml_path(data_path, logger)

    if not xml_file:
        logger.warning("sts.xml is missing. Productivity won't be shown")
        [p0, p1, p2] = [None] * 3
    else:
        [p0, p1, p2] = parse_sts_xml(
            xml_file,
            ns="http://pacificbiosciences.com/PipelineStats/PipeStats.xsd")
        logger.info("Parsed sts.xml")

    csv_path = get_sts_csv_path(data_path, logger)
    if not csv_path:
        logger.error("Platform QC failed due to missing csv files")
        return 1

    df = load_sts_csv(csv_path)
    logger.info("Stat file was loaded.")

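    # keep reads passing the quality filter (ReadScore > 0.1) and compute
    # their HQ-region lengths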
    mask = df['ReadScore'] > 0.1
    vals = df['HQRegionEnd'].values[mask] - df['HQRegionStart'].values[mask]

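    # fit a two-parameter gamma distribution to the HQ-region lengths,
    # then collect summary statistics (max, mean, N50, N90, throughput)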
    (a, b) = lq_gamma.estimate_gamma_dist_scipy(vals, logger)
    logger.info("Fitting by Gamma dist finished.")
    _max = np.array(vals).max()
    _mean = np.array(vals).mean()
    _n50 = get_N50(vals)
    _n90 = get_NXX(vals, 90)
    throughput = np.sum(vals)

    ### HQ fraction of each polymerase read (HQ-region length / NumBases)
    fracs = vals / df['NumBases'].values[mask]

    tobe_json["Productivity"] = {"P0": p0, "P1": p1, "P2": p2}
    tobe_json["Throughput"] = int(throughput)
    tobe_json["Longest_read"] = int(_max)
    tobe_json["Num_of_reads"] = len(vals)
    tobe_json["polread_gamma_params"] = [float(a), float(b)]
    tobe_json["Mean_polread_length"] = float(_mean)
    tobe_json["N50_polread_length"] = float(_n50)
    tobe_json["Mean_HQ_fraction"] = float(np.mean(fracs))

    with open(json_path, "w") as f:
        logger.info("Quality measurements were written into a JSON file: %s" %
                    json_path)
        json.dump(tobe_json, f, indent=4)

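    # overlay the fitted gamma PDF (up to its 99th percentile) on a
    # density-normalized histogram of the HQ-region lengths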
    x = np.linspace(0, gamma.ppf(0.99, a, 0, b))
    est_dist = gamma(a, 0, b)
    plt.plot(x, est_dist.pdf(x), c=rgb(214, 39, 40))
    plt.grid(True)
    plt.hist(vals,
             histtype='step',
             bins=np.arange(min(vals), _max + b_width, b_width),
             color=rgb(214, 39, 40),
             alpha=0.7,
             density=True)
    plt.xlabel('Read length')
    plt.ylabel('Probability density')

    if _mean >= 10000:  # pol read mean is expected to be 10-15 kb; the upper bound is not checked here.
        plt.axvline(x=_mean,
                    linestyle='dashed',
                    linewidth=2,
                    color=rgb(44, 160, 44),
                    alpha=0.8)
    else:
        plt.axvline(x=_mean,
                    linestyle='dashed',
                    linewidth=2,
                    color=rgb(188, 189, 34),
                    alpha=0.8)

    if _n50 >= 20000:  # a recent brochure says 20 kb, though an older announcement said 14 kb.
        plt.axvline(x=_n50, linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_n50, linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

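    # second histogram: raw polymerase read lengths (NumBases) for comparison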
    vals = df['NumBases'].values[mask]
    plt.hist(vals,
             histtype='step',
             bins=np.arange(min(vals),
                            max(vals) + b_width, b_width),
             color=rgb(31, 119, 180),
             alpha=0.7,
             density=True)

    ymin, ymax = plt.gca().get_ylim()
    xmin, xmax = plt.gca().get_xlim()
    plt.text(xmax * 0.6, ymax * 0.72, r'$\alpha=%.3f,\ \beta=%.3f$' % (a, b))
    plt.text(xmax * 0.6, ymax * 0.77, r'Gamma dist params:')

    plt.text(xmax * 0.6, ymax * 0.85, r'sample mean: %.3f' % (_mean, ))
    plt.text(xmax * 0.6, ymax * 0.9, r'N50: %.3f' % (_n50, ))
    plt.text(xmax * 0.6, ymax * 0.95, r'N90: %.3f' % (_n90, ))

    plt.text(_mean, ymax * 0.85, r'Mean')
    plt.text(_n50, ymax * 0.9, r'N50')

    plt.savefig(fig_path, bbox_inches="tight")
    plt.close()

    ### read score after size binning.
    subplot = gen_boxplot_length_vs_score(df, b_width)
    xmin, xmax = plt.gca().get_xlim()
    plt.title("Read scores over different length reads")
    plt.xticks(np.arange(xmax + 1),
               [int(i) for i in np.arange(xmax + 1) * b_width])
    plt.suptitle("")
    plt.savefig(fig_path2, bbox_inches="tight")
    plt.close()

    logger.info("Figs were generated.")
    logger.info("Finished all processes.")
Example #2
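# context assumed but not shown by the listing: imports of os, json, logging,
# numpy as np, matplotlib.pyplot as plt, scipy.stats.gamma, pysam, and
# module-local helpers (lq_gamma, get_sts_xml_path, parse_sts_xml,
# get_bam_path, get_readtype, set_scrap, set_subreads, construct_polread,
# get_N50, get_NXX, rgb)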
def run_platformqc(data_path, output_path, *, suffix=None, b_width=1000):
    if not suffix:
        suffix = ""
    else:
        suffix = "_" + suffix
    log_path = os.path.join(output_path, "log",
                            "log_sequel_platformqc" + suffix + ".txt")
    fig_path = os.path.join(output_path, "fig",
                            "fig_sequel_platformqc_length" + suffix + ".png")
    fig_path_bar = os.path.join(
        output_path, "fig", "fig_sequel_platformqc_adapter" + suffix + ".png")
    json_path = os.path.join(output_path, "QC_vals_sequel" + suffix + ".json")
    # values to be written out as the QC JSON report
    tobe_json = {}

    # create the log/fig subdirectories (output_path is created as well)
    os.makedirs(os.path.join(output_path, "log"), exist_ok=True)
    os.makedirs(os.path.join(output_path, "fig"), exist_ok=True)

    ### logging conf ###
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(log_path, 'w')
    sh = logging.StreamHandler()

    formatter = logging.Formatter(
        '%(module)s:%(asctime)s:%(lineno)d:%(levelname)s:%(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)

    logger.addHandler(sh)
    logger.addHandler(fh)
    #####################

    logger.info("Started sequel platform QC for %s" % data_path)

    # sequel
    xml_file = get_sts_xml_path(data_path, logger)

    if not xml_file:
        logger.warning("sts.xml is missing. Productivity won't be shown")
        [p0, p1, p2] = [None] * 3
    else:
        [p0, p1, p2] = parse_sts_xml(
            xml_file,
            ns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd")
        logger.info("Parsed sts.xml")

    [subr_bam_p, scrap_bam_p] = get_bam_path(data_path, logger)
    if subr_bam_p and scrap_bam_p:
        scrap_bam = pysam.AlignmentFile(scrap_bam_p, 'rb', check_sq=False)
        subr_bam = pysam.AlignmentFile(subr_bam_p, 'rb', check_sq=False)
    else:
        logger.error("Platform QC failed due to missing bam files")
        return 1

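    # accumulators: per-read records, per-channel SNR, HQ fraction per read,
    # total and HQ read lengths, and a histogram of adapter counts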
    bam_reads = {}
    snr = [[], [], [], []]
    hr_fraction = []
    tot_lengths = []
    hr_lengths = []
    ad_num_stat = {}
    control_throughput = 0

    if get_readtype(scrap_bam.header) == 'SCRAP':
        logger.info("Started to load scraps.bam...")
        control_throughput = set_scrap(bam_reads, scrap_bam, snr)
    else:
        logger.error("The given scrap file has an incorrect header.")

    logger.info("Scrap reads were loaded.")

    if get_readtype(subr_bam.header) == 'SUBREAD':
        logger.info("Started to load subreads.bam...")
        set_subreads(bam_reads, subr_bam, snr)
    else:
        logger.error("The given subread file has an incorrect header.")

    logger.info("Subreads were loaded.")

    for k, v in bam_reads.items():
        l = construct_polread(v)

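        # l: tuple returned by construct_polread; judging from its use below,
        # l[2] ~ HQ length, l[3] ~ total length, l[4] ~ validity flag,
        # l[5] ~ adapter count (field meanings inferred, not confirmed)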
        if l[4]:
            hr_fraction.append(l[2] / l[3])
            tot_lengths.append(l[3])
            hr_lengths.append(l[2])
            ad_num_stat[l[5]] = ad_num_stat.get(l[5], 0) + 1

    max_adnum = max(ad_num_stat.keys())
    min_adnum = min(ad_num_stat.keys())

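    # build a dense bar-chart series over the observed adapter-count range,
    # filling gaps with zeros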
    left = []
    height = []
    for i in range(min_adnum, max_adnum + 1):
        left.append(i)
        if i in ad_num_stat:
            height.append(ad_num_stat[i])
        else:
            height.append(0)

    plt.bar(left, height)
    plt.savefig(fig_path_bar, bbox_inches="tight")
    plt.close()
    logger.info("Plotted bar plot for adpter occurence")

    (a, b) = lq_gamma.estimate_gamma_dist_scipy(hr_lengths)
    logger.info("Fitting by Gamma dist finished.")

    _max = np.array(hr_lengths).max()
    _mean = np.array(hr_lengths).mean()
    _n50 = get_N50(hr_lengths)
    _n90 = get_NXX(hr_lengths, 90)
    throughput = np.sum(hr_lengths)
    longest = np.max(hr_lengths)
    fracs = np.mean(hr_fraction)

    tobe_json["Productivity"] = {"P0": p0, "P1": p1, "P2": p2}
    tobe_json["Throughput"] = int(throughput)
    tobe_json["Throughput(Control)"] = int(control_throughput)
    tobe_json["Longest_read"] = int(_max)
    tobe_json["Num_of_reads"] = len(hr_lengths)
    tobe_json["polread_gamma_params"] = [float(a), float(b)]
    tobe_json["Mean_polread_length"] = float(_mean)
    tobe_json["N50_polread_length"] = float(_n50)
    tobe_json["Mean_HQ_fraction"] = float(np.mean(fracs))
    tobe_json["Adapter_observation"] = ad_num_stat

    with open(json_path, "w") as f:
        logger.info("Quality measurements were written into a JSON file: %s" %
                    json_path)
        json.dump(tobe_json, f, indent=4)

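    # overlay the fitted gamma PDF on a density-normalized histogram of the
    # HQ read lengths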
    x = np.linspace(0, gamma.ppf(0.99, a, 0, b))
    est_dist = gamma(a, 0, b)
    plt.plot(x, est_dist.pdf(x), c=rgb(214, 39, 40))
    plt.grid(True)
    plt.hist(hr_lengths,
             histtype='step',
             bins=np.arange(min(hr_lengths), _max + b_width, b_width),
             color=rgb(214, 39, 40),
             alpha=0.7,
             density=True)
    plt.xlabel('Read length')
    plt.ylabel('Probability density')

    if _mean >= 10000:  # pol read mean is expected to be 10-15 kb; the upper bound is not checked here.
        plt.axvline(x=_mean,
                    linestyle='dashed',
                    linewidth=2,
                    color=rgb(44, 160, 44),
                    alpha=0.8)
    else:
        plt.axvline(x=_mean,
                    linestyle='dashed',
                    linewidth=2,
                    color=rgb(188, 189, 34),
                    alpha=0.8)

    if _n50 >= 20000:
        plt.axvline(x=_n50, linewidth=2, color=rgb(44, 160, 44), alpha=0.8)
    else:
        plt.axvline(x=_n50, linewidth=2, color=rgb(188, 189, 34), alpha=0.8)

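    # second histogram: total polymerase read lengths for comparison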
    plt.hist(tot_lengths,
             histtype='step',
             bins=np.arange(min(tot_lengths),
                            max(tot_lengths) + b_width, b_width),
             color=rgb(31, 119, 180),
             alpha=0.7,
             density=True)

    ymin, ymax = plt.gca().get_ylim()
    xmin, xmax = plt.gca().get_xlim()
    plt.text(xmax * 0.6, ymax * 0.72, r'$\alpha=%.3f,\ \beta=%.3f$' % (a, b))
    plt.text(xmax * 0.6, ymax * 0.77, r'Gamma dist params:')

    plt.text(xmax * 0.6, ymax * 0.85, r'sample mean: %.3f' % (_mean, ))
    plt.text(xmax * 0.6, ymax * 0.9, r'N50: %.3f' % (_n50, ))
    plt.text(xmax * 0.6, ymax * 0.95, r'N90: %.3f' % (_n90, ))

    plt.text(_mean, ymax * 0.85, r'Mean')
    plt.text(_n50, ymax * 0.9, r'N50')

    plt.savefig(fig_path, bbox_inches="tight")
    plt.close()

    logger.info("Figs were generated.")
    logger.info("Finished all processes.")