import numpy as np

# bh_predictions, tpr, and fdr are shared utilities defined elsewhere in this
# project (Benjamini-Hochberg selection plus true positive / false discovery
# rate helpers).


def p_tpr_fdr(p_values, intervals, truth, fdr_threshold=0.1):
    """For every pair of intervals, run BH selection on the finished p-values
    and record the resulting true positive rate and false discovery rate."""
    interval_tpr = np.full((intervals.shape[0], intervals.shape[0]), np.nan)
    interval_fdr = np.full(interval_tpr.shape, np.nan)
    for idx1 in range(intervals.shape[0]):
        for idx2 in range(intervals.shape[0]):
            p = p_values[:, idx1, idx2]
            # Skip interval pairs that have not finished running yet
            if np.any(np.isnan(p)):
                continue
            p_indices = ~np.isnan(p)
            p = p[p_indices]
            t = truth[p_indices]
            d = bh_predictions(p, fdr_threshold)
            interval_tpr[idx1, idx2] = tpr(t, d)
            interval_fdr[idx1, idx2] = fdr(t, d)
    return interval_tpr, interval_fdr
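
# Hypothetical usage sketch (not part of the original script): the array shapes
# and names below are assumptions chosen only to illustrate the expected inputs,
# with p_values indexed as (feature, interval, interval).
def _example_p_tpr_fdr():
    rng = np.random.default_rng(0)
    intervals = np.linspace(0.05, 0.95, 10)
    p_values = rng.uniform(size=(500, intervals.shape[0], intervals.shape[0]))
    truth = rng.binomial(1, 0.1, size=500)  # 1 marks a true signal feature
    return p_tpr_fdr(p_values, intervals, truth, fdr_threshold=0.1)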
            importance = rf_importance(model.models)
        else:
            importance = linear_model_importance(model.models)

        # Get the p-values and add correction term
        all_p_filename = "data/{}/{}.npy".format(trial, info.prefix)
        p_values[info.name][trial] = np.load(all_p_filename)
        p_values[info.name][trial] = (p_values[info.name][trial] * nperms + 1) / (nperms + 1)

        # Run the filtered testing procedure
        important = importance >= importance_threshold
        p_important = p_values[info.name][trial][important]
        print("\tFiltering down to {} features".format(important.sum()))
        pred = np.zeros(P, dtype=int)
        pred[important] = bh_predictions(p_important, fdr_threshold)
        tpr_vals[info.name][trial] = tpr(truth, pred)
        fdr_vals[info.name][trial] = fdr(truth, pred)

    labels = [info.name for info in infos]

    print("Plotting power and FDR results")
    results_plot(
        [tpr_vals[label] for label in labels],
        [fdr_vals[label] for label in labels],
        [label.replace(" ", "\n") for label in labels],
        fdr_threshold,
    )
    plt.savefig("plots/predictors-importance.pdf", bbox_inches="tight")
    plt.close()
def add_if_finished(trial, all_p_values, all_tpr, all_fdr, truth, fdr_threshold):
    if not np.any(np.isnan(all_p_values[trial])):
        predictions = bh_predictions(all_p_values[trial], fdr_threshold)
        all_tpr[trial] = tpr(truth, predictions)
        all_fdr[trial] = fdr(truth, predictions)
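
# bh_predictions is defined elsewhere in the project. As a point of reference,
# a minimal Benjamini-Hochberg selection sketch (an assumption about what that
# helper computes, not the project's implementation) looks like:
def _bh_sketch(p, fdr_threshold):
    p = np.asarray(p)
    m = p.shape[0]
    order = np.argsort(p)
    # Largest k such that p_(k) <= (k / m) * fdr_threshold
    below = p[order] <= (np.arange(1, m + 1) / m) * fdr_threshold
    k = (np.nonzero(below)[0].max() + 1) if below.any() else 0
    selected = np.zeros(m, dtype=int)
    selected[order[:k]] = 1  # 1 = feature declared a discovery
    return selected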
                    p_filename
                ):
                    p = np.load(p_filename)
                    p_values[info.name][trial, feature] = p

            np.save(all_p_filename, p_values[info.name][trial])
            p_values[info.name][trial] = (p_values[info.name][trial] * nperms + 1) / (
                nperms + 1
            )
            missing = np.isnan(p_values[info.name][trial])
            signal_p_values[info.name].extend(
                p_values[info.name][trial, :S][~missing[:S]]
            )
            p_trial = np.ones(P)
            p_trial[~missing] = p_values[info.name][trial][~missing]
            pred = bh_predictions(p_trial, fdr_threshold)
            tpr_vals[info.name][trial] = tpr(truth, pred)
            fdr_vals[info.name][trial] = fdr(truth, pred)
            if np.any(missing):
                if missing.sum() > 10:
                    print("Total missing: {}".format(missing.sum()))
                else:
                    print("Missing: {}".format(np.arange(P)[missing]))

            # Load the p-values for the other models
            for path, name in [
                ("cv_robust_nonlinear_p_values", "Neural Net"),
                ("cv_robust_linear_p_values", "OLS"),
            ]:
                p_trial = np.load("data/{}/{}.npy".format(trial, path))
                p_trial = (p_trial * nperms + 1) / (nperms + 1)
def main():
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40   # number of signal features
    nperms = 5000
    nbootstraps = 100
    fdr_threshold = 0.1
    ntrials = 100
    names = ['Holdout\nPermutation', 'Calibrated\nHRT\n(linear)', 'Uncalibrated\nHRT',
             'CV\nPermutation', 'Calibrated\nCV-HRT\n(linear)', 'Uncalibrated\nCV-HRT',
             'Calibrated\nHRT\n(linear)', 'Calibrated\nHRT',
             'Calibrated\nCV-HRT\n(linear)', 'Calibrated\nCV-HRT']
    prefixes = ['perm', 'linear', 'nonlinear',
                'cv_perm', 'cv_linear', 'cv_nonlinear',
                'robust_linear', 'robust_nonlinear',
                'cv_robust_linear', 'cv_robust_nonlinear']

    p_values = np.full((len(prefixes), ntrials, P), np.nan)
    tpr_vals = np.full((len(prefixes), ntrials), np.nan)
    fdr_vals = np.full((len(prefixes), ntrials), np.nan)

    for idx, prefix in enumerate(prefixes):
        for trial in range(ntrials):
            if (trial % 25) == 0:
                print('{} trial: {}'.format(prefix, trial))

            TRUTH_PATH = 'data/{}/truth.csv'.format(trial)
            truth = np.loadtxt(TRUTH_PATH, delimiter=',')

            # Load the aggregate p-value file if it exists
            P_VALUE_PATH = 'data/{}/{}_p_values.npy'.format(trial, prefix)
            if os.path.exists(P_VALUE_PATH):
                p_values[idx, trial] = np.load(P_VALUE_PATH)

            # Fill in any missing entries from the per-feature files
            clean_up_needed = False
            if np.any(np.isnan(p_values[idx, trial])):
                clean_up_needed = True
                for feature in range(P):
                    Pi_PATH = 'data/{}/{}_p_values_{}.npy'.format(trial, prefix, feature)
                    if np.isnan(p_values[idx, trial, feature]) and os.path.exists(Pi_PATH):
                        try:
                            p_values[idx, trial, feature] = np.load(Pi_PATH)
                        except Exception:
                            os.remove(Pi_PATH)

            # p_values[idx, trial] = p_values[idx, trial] * nperms / (nperms+1)
            missing = np.isnan(p_values[idx, trial])
            pred = bh_predictions(p_values[idx, trial][~missing], fdr_threshold)
            tpr_vals[idx, trial] = tpr(truth[~missing], pred)
            fdr_vals[idx, trial] = fdr(truth[~missing], pred)

            if not np.any(np.isnan(p_values[idx, trial])):
                # Clean up: consolidate the per-feature files into the aggregate file
                if clean_up_needed:
                    np.save(P_VALUE_PATH, p_values[idx, trial])
                    for feature in range(P):
                        Pi_PATH = 'data/{}/{}_p_values_{}.npy'.format(trial, prefix, feature)
                        if os.path.exists(Pi_PATH):
                            # print('Would delete {}'.format((idx, trial, feature)))
                            os.remove(Pi_PATH)
            else:
                print('Trial {} Nulls: {}'.format(trial, np.where(np.isnan(p_values[idx, trial]))[0]))

        if 'robust' in prefix:
            # Get the distribution of confidence intervals
            bounds = np.full((ntrials, P, 2), np.nan)
            for trial in range(ntrials):
                BOUNDS_PATH = 'data/{}/{}_bounds.npy'.format(trial, prefix)
                if os.path.exists(BOUNDS_PATH):
                    bounds[trial] = np.load(BOUNDS_PATH)

                clean_up_needed = False
                if np.any(np.isnan(bounds[trial])):
                    clean_up_needed = True
                    for feature in range(P):
                        BOUNDS_i_PATH = 'data/{}/{}_bounds_{}.npy'.format(trial, prefix, feature)
                        if np.any(np.isnan(bounds[trial, feature])) and os.path.exists(BOUNDS_i_PATH):
                            bounds[trial, feature] = np.load(BOUNDS_i_PATH)

                if not np.any(np.isnan(bounds[trial])):
                    # Clean up: consolidate the per-feature bounds into the aggregate file
                    if clean_up_needed:
                        np.save(BOUNDS_PATH, bounds[trial])
                        for feature in range(P):
                            BOUNDS_i_PATH = 'data/{}/{}_bounds_{}.npy'.format(trial, prefix, feature)
                            if os.path.exists(BOUNDS_i_PATH):
                                # print('Would delete {}'.format(BOUNDS_i_PATH))
                                os.remove(BOUNDS_i_PATH)

            bounds_plot(bounds)
            plt.savefig('plots/liang-bounds-{}.pdf'.format(prefix.replace('_', '-')), bbox_inches='tight')

        print('*** {} model ({} trials) ***'.format(names[idx], (~np.isnan(tpr_vals[idx])).sum()))
        print('TPR: {:.2f}%'.format(np.nanmean(tpr_vals[idx], axis=0) * 100))
        print('FDR: {:.2f}%'.format(np.nanmean(fdr_vals[idx], axis=0) * 100))
        print('')

        p_plot(p_values[idx], S)
        plt.savefig('plots/liang-p-values-{}.pdf'.format(prefix.replace('_', '-')), bbox_inches='tight')

    selected = np.array([0, 3, 2, 5, 7, 9])
    results_plot(
        [tpr_vals[i] for i in selected],
        [fdr_vals[i] for i in selected],
        [names[i] for i in selected],
        fdr_threshold,
    )
    plt.savefig('plots/liang-tpr-fdr.pdf', bbox_inches='tight')
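

# Assumed script entry point (not shown in this excerpt): run the full analysis
# when the module is executed directly.
if __name__ == '__main__':
    main()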