def get_z(recalculate: bool, data_logp: pd.DataFrame,
          data_corr: pd.DataFrame,
          filepath: Optional[str] = None) -> pd.DataFrame:
    """Get the z-score based on p-values of the correlation matrix

    The magnitude of each z-score is derived from the log p-value and its
    sign is taken from the corresponding correlation. The elapsed wall-clock
    time is printed to stdout before returning.

    Parameters
    ----------
    recalculate :
        If True, recalculate the z-scores
    data_logp :
        The logp values. Its columns are used to label both the rows and
        the columns of the calculated z-score dataframe.
    data_corr :
        The correlation matrix of entity-entity correlations. Negative
        entries contribute the sign -1, positive entries the sign +1; all
        other entries are used unchanged as the multiplier.
    filepath :
        Path of the HDF5 file *without* the '.h5' suffix; the suffix is
        appended by this function.
        If `recalculate==False`: read the z-score values from this file.
        If `recalculate==True`: write the z-score values to this file,
        using the last '/'-separated component of `filepath` as HDF key.
        If not provided, run the calculation and return the z-score
        dataframe without writing it to a file.

    Returns
    -------
    :
        A dataframe with the z-scores
    """
    start = time()
    if recalculate or filepath is None:
        # |ndtri_exp(ln(p) - ln(2))| is the two-sided z magnitude for p,
        # evaluated directly from the log p-value. Earlier formulations,
        # kept for reference:
        #   stats.norm.ppf(1 - np.exp(data_logp) / 2)
        #   -norminv_logcdf(data_logp - np.log(2))
        z_mat = abs(ndtri_exp(data_logp - np.log(2)))

        # Reduce the correlations to their sign
        data_sign = data_corr.copy()
        data_sign[data_sign < 0] = -1
        data_sign[data_sign > 0] = 1

        axis_labels = data_logp.columns
        data_z = data_sign * pd.DataFrame(
            z_mat, index=axis_labels, columns=axis_labels)

        if filepath is not None:
            logger.info(f'Saving z score dataframe to {"%s.h5" % filepath}')
            # `key` is passed by keyword: positional use is deprecated in
            # pandas 2.1 and `key` becomes keyword-only in pandas 3.0.
            data_z.to_hdf('%s.h5' % filepath, key=filepath.split('/')[-1])
    else:
        logger.info(f'Reading z-score dataframe from {filepath}')
        data_z = pd.read_hdf('%s.h5' % filepath)

    elapsed = time() - start
    print(elapsed, "sec")
    return data_z
def test_outside_domain(self): assert np.isnan(ndtri_exp(1.0))
def test_asymptotes(self): assert_equal(ndtri_exp([-np.inf, 0.0]), [-np.inf, np.inf])
def log_ndtr_ndtri_exp(y):
    """Apply `ndtri_exp` to `y` and then `log_ndtr` to the result."""
    quantile = ndtri_exp(y)
    return log_ndtr(quantile)
def main():
    """Plot the percentage of explained correlations per graph type.

    Collects the explainer data via `_loop_explainers(expl_dir)` and, for
    every graph type that has data, saves two line plots as PDF files in
    `outdir`: one with a linear y-axis and one with a logarithmic y-axis.
    Each plot is also shown on screen when `args.show_plot` is set.
    """
    logger.info('Extracting data from explainers')
    expl_data = _loop_explainers(expl_dir)

    # Absolute z-score that corresponds to a two-sided p-value of 0.05
    # (about 1.96). ln(2) has to be subtracted from the log p-value
    # *inside* the ndtri_exp call, as spelled out in `fdr_label`;
    # subtracting it from the returned z-score gives a wrong threshold.
    # The value does not depend on the graph type, so compute it once.
    fdr_line = abs(ndtri_exp(np.log(0.05) - np.log(2)))
    fdr_label = 'FDR=|ndtri_exp(ln(.05)-ln(2))|'

    base_columns = ['range', 'filter_w_count', 'x_pos'] + labels

    # Per graph type, extract what the old code has
    for graph_type, list_of_expl_data in expl_data.items():
        if not list_of_expl_data:
            logger.info(f'Skipping graph type {graph_type}')
            continue

        logger.info(f'Plotting for graph type {graph_type}')

        # One single-row frame per data entry, concatenated in one go.
        # DataFrame.append was removed in pandas 2.0.
        frames = [pd.DataFrame(data=data, index=[0])
                  for data in list_of_expl_data]
        stats_norm = pd.concat(frames, sort=False)
        # Keep the expected columns first (created as NaN when absent),
        # followed by any additional columns found in the data.
        extra_columns = [col for col in stats_norm.columns
                         if col not in base_columns]
        stats_norm = stats_norm.reindex(columns=base_columns + extra_columns)
        stats_norm = stats_norm.sort_values('x_pos')

        # Plot
        stats_norm.plot(x='x_pos',
                        y=labels,
                        legend=legend_labels,
                        kind='line',
                        marker='o',
                        title=f'{data_title}, {graph_type.capitalize()}')

        # NOTE(review): presumably the smallest x_pos is -1 and stands for
        # the random baseline ('RND'), so the numeric ticks start at the
        # second-smallest x_pos - confirm. Requires at least two rows.
        ticks = [-1] + list(
            range(int(stats_norm.x_pos.values[1]),
                  int(stats_norm.x_pos.max()) + 2, 2))
        ticks_labels = ['RND'] + [str(n) for n in ticks[1:]]

        plt.xticks(ticks=ticks, labels=ticks_labels)
        plt.xlabel('abs(z-score) lower bound')
        plt.ylabel('Pct. Corrs. Explained')
        plt.ylim((0, 100))
        plt.axvline(x=fdr_line, ymax=0.65, color='c', label=fdr_label)
        plt.legend()
        fpath = Path(outdir).joinpath(f'{data_title}_{graph_type}.pdf')
        logger.info(f'Saving plot output to {fpath}')
        plt.savefig(fpath)
        if args.show_plot:
            plt.show()

        # Same data again with a logarithmic y-axis
        stats_norm.plot(x='x_pos',
                        y=labels,
                        legend=legend_labels,
                        kind='line',
                        marker='o',
                        logy=True,
                        title=f'{data_title}, '
                              f'{graph_type.capitalize()} (ylog)')
        plt.xticks(ticks=ticks, labels=ticks_labels)
        plt.xlabel('abs(z-score) lower bound')
        plt.ylabel('Pct. Corrs. Explained')
        plt.ylim((10**-2, 10**2))
        plt.axvline(x=fdr_line, ymin=0.35, color='c', label=fdr_label)
        plt.legend()
        plt.savefig(
            Path(outdir).joinpath(f'{data_title}_{graph_type}_ylog.pdf'))
        if args.show_plot:
            plt.show()