def test_damage(ref, bam, mode, wlen, show_al, min_al, min_cov, process, verbose):
    """Prepare data and run Vuong's test to test for damage

    Args:
        ref (str): name of reference in alignment file
        bam (str): bam file
        mode (str): opening mode of alignment file
        wlen (int): window length
        show_al (bool): Show alignment representations
        min_al (int): Minimum number of aligned reads
        min_cov (float): Minimum coverage
        process (int): Number of process for parallelization
        verbose (bool): Run in verbose mode
    Returns:
        dict: Dictionary containing Vuong test results, as returned by
            ``check_model_fit``. ``False`` is returned when a ``ValueError``
            is raised while preparing or fitting the data, and ``None`` when
            no fit is attempted (thresholds not met, or ``get_damage``
            returned no ``ct_data``).
    """
    # Pre-bind the values reported by the error handler so that a ValueError
    # raised before they are computed cannot become an UnboundLocalError.
    cov = nb_reads_aligned = reflen = None

    al_handle = pysam.AlignmentFile(bam, mode=mode, threads=process)
    try:
        cov = avg_coverage(al_handle.count_coverage(contig=ref))
        nb_reads_aligned = al_handle.count(contig=ref)
        reflen = al_handle.get_reference_length(ref)

        if not (nb_reads_aligned >= min_al or cov >= min_cov):
            if verbose:
                print(
                    f"Did not attempt to fit a model to {ref} because of too few reads aligned")
                print(
                    f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} - reflen: {reflen}\n")
            return None

        al = al_to_damage(reference=ref, al_handle=al_handle)
        # Only the C->T, G->A and C->C counts plus the base totals are used
        # for the fit; the two remaining return values are ignored.
        ct_data, ga_data, cc_data, _c_data, _g_data, all_bases = al.get_damage(
            wlen=wlen, show_al=show_al)
        if not ct_data:
            return None

        model_A = models.geom_mod()
        model_B = models.unif_mod()
        test_res = fit_models(ref=ref,
                              model_A=model_A,
                              model_B=model_B,
                              ct_data=ct_data,
                              cc_data=cc_data,
                              ga_data=ga_data,
                              all_bases=all_bases,
                              wlen=wlen,
                              verbose=verbose)
        test_res['reference'] = ref
        test_res['nb_reads_aligned'] = nb_reads_aligned
        test_res['coverage'] = cov
        return check_model_fit(test_res, wlen, verbose)
    except ValueError as e:
        if verbose:
            print(
                f"Model fitting for {ref} failed because of too few reads aligned")
            print(f"Model fitting error: {e}")
            print(
                f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} - reflen: {reflen}\n")
        return False
    finally:
        # Always release the alignment file, whichever path returns.
        al_handle.close()
def test_geom_pmf(generate_data):
    """Check ``geom_mod.pmf`` element-wise against stored reference values.

    Args:
        generate_data: indexable test data; element 0 is passed as ``x``.
    """
    g = models.geom_mod()
    expected = np.array([
        0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
        0.15471624, 0.15471624, 0.15471624, 0.15471624, 0.15471624,
        0.08207436, 0.08207436,
        0.04575342, 0.02759295, 0.01851272, 0.0139726,
        0.01170254, 0.01056751, 0.01,
    ])
    actual = g.pmf(x=generate_data[0], geom_p=0.5,
                   geom_pmin=0.01, geom_pmax=0.5)
    # Compare the arrays element-wise. `a.all() == b.all()` only compares two
    # booleans and passes for any pair of arrays without zero entries.
    # The reference values are rounded to 8 decimals, hence the tolerance.
    np.testing.assert_allclose(actual, expected, rtol=1e-6)
def test_geom_log_pmf(generate_data):
    """Check ``geom_mod.log_pmf`` element-wise against stored reference values.

    Args:
        generate_data: indexable test data; element 0 is passed as ``x``.
    """
    g = models.geom_mod()
    expected = np.array([
        -1.2039728, -1.2039728, -1.2039728, -1.2039728, -1.2039728,
        -1.2039728, -1.2039728, -1.2039728, -1.2039728, -1.2039728,
        -1.86616253, -1.86616253, -1.86616253, -1.86616253, -1.86616253,
        -2.50012956, -2.50012956,
        -3.08448863, -3.59019479, -3.98929721, -4.27065681,
        -4.44794902, -4.54997064, -4.60517019,
    ])
    actual = g.log_pmf(x=generate_data[0], geom_p=0.5,
                       geom_pmin=0.01, geom_pmax=0.5, wlen=24)
    # Compare the arrays element-wise. `a.all() == b.all()` only compares two
    # booleans and passes for any pair of arrays without zero entries.
    # The reference values are rounded to 8 decimals, hence the tolerance.
    np.testing.assert_allclose(actual, expected, rtol=1e-6)
def test_geom_optim(generate_data):
    """Fit the geometric model with ``optim`` and compare to known parameters.

    Elements 1 and 2 of ``generate_data`` are passed as ``xdata`` and
    ``ydata``. Each fitted parameter must match its reference value once both
    are rounded to three decimals.
    """
    geom = models.geom_mod()
    fitted, _errors = optim(function=geom.pmf,
                            parameters=geom.kwds,
                            xdata=generate_data[1],
                            ydata=generate_data[2],
                            bounds=geom.bounds,
                            loss='linear')
    target = {
        'geom_p': 0.6039535547658853,
        'geom_pmin': 0.03637474931290336,
        'geom_pmax': 0.4211432052501663
    }
    for name, value in fitted.items():
        assert round(value, 3) == round(target[name], 3)
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1 ]).all() def test_unif_log_pmf(generate_data): u = models.unif_mod() assert u.log_pmf(x=generate_data[0], unif_pmin=0.1).all() == np.array([ -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509, -2.30258509 ]).all() def test_unif_optim(generate_data): u = models.unif_mod() o, e = optim(function=u.pmf, parameters=u.kwds, xdata=generate_data[1], ydata=generate_data[2], bounds=u.bounds, loss='linear') assert o == {'unif_pmin': 0.1000000000000005} if __name__ == "__main__": data, xdata, ydata = generate_data() g = models.geom_mod() o = optim(function=g.pmf, parameters=g.kwds, xdata=xdata, ydata=ydata)
def damageplot(damage_dict, wlen, outdir):
    """Draw pydamage plots

    Plots the observed C-to-T and G-to-A frequencies together with the fitted
    null (uniform) and damage (geometric) models and their 2-sigma bands,
    adds an inset Q-Q plot of the damage model residuals, and saves the
    figure as ``<outdir>/<reference>.png``.

    Args:
        damage_dict(dict): pydamage result dictionary. Keys read here:
            'qlen', 'CtoT-<i>' and 'GtoA-<i>' for every i in range(qlen),
            'unif_pmin', 'unif_pmin_stdev', 'geom_p', 'geom_pmin',
            'geom_pmin_stdev', 'geom_pmax', 'geom_pmax_stdev', 'reference',
            'pvalue', 'coverage', 'residuals' and 'RMSE'.
        wlen(int): window length
        outdir(str): Pydamage result directory
    """
    x = np.array(range(wlen))
    qlen = np.array(range(damage_dict['qlen']))
    c2t = np.array([damage_dict[f"CtoT-{i}"] for i in qlen])
    g2a = np.array([damage_dict[f"GtoA-{i}"] for i in qlen])

    unif_pmin = damage_dict['unif_pmin']
    unif_pmin_stdev = damage_dict['unif_pmin_stdev']
    geom_p = damage_dict['geom_p']
    geom_pmin = damage_dict['geom_pmin']
    geom_pmin_stdev = damage_dict['geom_pmin_stdev']
    geom_pmax = damage_dict['geom_pmax']
    geom_pmax_stdev = damage_dict['geom_pmax_stdev']
    contig = damage_dict['reference']
    pvalue = damage_dict['pvalue']
    coverage = damage_dict['coverage']
    residuals = damage_dict['residuals']
    rmse = damage_dict['RMSE']
    plotdir = outdir

    if pvalue < 0.001:
        rpval = "<0.001"
    else:
        rpval = f"={round(pvalue,3)}"

    # Null model curve with a +/- 2 sigma band; the parameter is clipped to
    # the model bounds and the resulting curve to [0, 1].
    unif = unif_mod()
    unif_pmin_low = max(unif.bounds[0][0], unif_pmin - 2 * unif_pmin_stdev)
    unif_pmin_high = min(unif.bounds[1][0], unif_pmin + 2 * unif_pmin_stdev)
    y_unif = unif.pmf(x, unif_pmin)
    y_unif_low = np.maximum(np.zeros(y_unif.shape[0]),
                            unif.pmf(x, unif_pmin_low))
    y_unif_high = np.minimum(np.ones(y_unif.shape[0]),
                             unif.pmf(x, unif_pmin_high))

    # Damage model curve with a +/- 2 sigma band on pmin and pmax, clipped
    # the same way.
    geom = geom_mod()
    geom_pmin_low = max(geom.bounds[0][1], geom_pmin - 2 * geom_pmin_stdev)
    geom_pmin_high = min(geom.bounds[1][1], geom_pmin + 2 * geom_pmin_stdev)
    geom_pmax_low = max(geom.bounds[0][2], geom_pmax - 2 * geom_pmax_stdev)
    geom_pmax_high = min(geom.bounds[1][2], geom_pmax + 2 * geom_pmax_stdev)
    y_geom = geom.pmf(x, geom_p, geom_pmin, geom_pmax)
    y_geom_low = np.maximum(np.zeros(y_geom.shape[0]),
                            geom.pmf(x, geom_p, geom_pmin_low, geom_pmax_low))
    y_geom_high = np.minimum(
        np.ones(y_geom.shape[0]),
        geom.pmf(x, geom_p, geom_pmin_high, geom_pmax_high))

    fig, ax = plt.subplots()
    try:
        ax.plot(qlen, c2t,
                color='#bd0d45',
                alpha=0.1,
                label='C to T transitions')
        ax.plot(qlen, g2a,
                color='#236cf5',
                alpha=0.1,
                label='G to A transitions')

        ax.plot(x, y_unif,
                linewidth=2.5,
                color='DarkOliveGreen',
                alpha=0.8,
                label='Null model')
        ax.fill_between(x, y_unif_low, y_unif_high,
                        color='DarkOliveGreen',
                        alpha=0.1,
                        label='Null Model CI (2 sigma)')

        ax.plot(x, y_geom,
                linewidth=2.5,
                color='#D7880F',
                alpha=0.8,
                label='Damage model')
        ax.fill_between(x, y_geom_low, y_geom_high,
                        color='#D7880F',
                        alpha=0.1,
                        label='Damage Model CI (2 sigma)')

        ax.set_xlabel("Base from 5'", fontsize=10)
        ax.set_ylabel("Substitution frequency", fontsize=10)
        ax.set_xticks(qlen)
        ax.set_xticklabels(qlen, rotation=45, fontsize=6)
        ax.set_title(f"coverage: {round(coverage,2)} | pvalue{rpval}",
                     fontsize=8)
        ax.legend(fontsize=8)

        # Inset axes (figure-relative coordinates) for the residual Q-Q plot.
        left, bottom, width, height = [0.65, 0.3, 0.2, 0.2]
        ax2 = fig.add_axes([left, bottom, width, height])
        probplot(residuals, plot=ax2, sparams=(0, np.std(c2t)))
        # probplot draws theoretical quantiles on the x axis and the ordered
        # sample values on the y axis, so label the axes accordingly.
        ax2.set_xlabel("Theoretical quantile", fontsize=6)
        ax2.set_ylabel("Observed value", fontsize=6)
        ax2.set_title(
            f"QQplot of Damage model residuals, \nRMSE={round(rmse, 3)}",
            fontsize=8)
        ax2.set_xticklabels([round(i, 3) for i in ax2.get_xticks()],
                            fontsize=6)
        ax2.set_yticklabels([round(i, 3) for i in ax2.get_yticks()],
                            fontsize=6)

        fig.suptitle(contig, fontsize=12, y=0.95)
        fig.savefig(f"{plotdir}/{contig}.png", dpi=200)
    finally:
        # Release the figure; otherwise figures pile up in pyplot's registry
        # when this function is called once per contig.
        plt.close(fig)