def run_hrt(
    feat_idx,
    X_drug,
    y_drug,
    elastic_model,
    features,
    ccle_features,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the HRT p-value for one feature of a fitted drug-response model.

    The tested feature is named by ``ccle_features[feat_idx]`` and located in
    the design matrix through ``features.get_loc``.  A conditional sampler for
    that column is calibrated on a reduced design (the tested column followed
    by a PCA projection of all other columns), and ``hrt`` is then run on the
    full matrix with the mean squared error of ``elastic_model`` as the test
    statistic.

    Args:
        feat_idx: Position of the feature name inside ``ccle_features``.
        X_drug: 2-D array of covariates (it is indexed as ``X_drug[:, j]``).
        y_drug: Responses compared against ``elastic_model.predict(...)``.
        elastic_model: Fitted model exposing ``predict``.
        features: Object exposing ``get_loc`` (presumably a pandas Index --
            confirm against the caller).
        ccle_features: Indexable collection of feature names.
        pca_components: ``n_components`` handed to scikit-learn's ``PCA``.
        discrete_threshold: Columns with at most this many distinct values
            use ``calibrate_discrete``; others use ``calibrate_continuous``.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Forwarded to ``hrt``.
        verbose: When true, print progress messages.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    target_name = ccle_features[feat_idx]
    column = features.get_loc(target_name)
    n_distinct = np.unique(X_drug[:, column]).shape[0]
    if verbose:
        print(
            "{} is feature number {} with {} unique values".format(
                target_name, column, n_distinct
            )
        )

    # Every column except the tested one is projected with PCA; the tested
    # column is then put back in front so it sits at index 0 for calibration.
    keep = np.ones(X_drug.shape[1], dtype=bool)
    keep[column] = False
    others = X_drug[:, keep]
    from sklearn.decomposition import PCA

    reducer = PCA(n_components=pca_components)
    projected = reducer.fit_transform(others)
    design = np.concatenate(
        [X_drug[:, column : column + 1], projected], axis=1
    )

    # Choose the conditional family from the number of distinct values.
    if n_distinct <= discrete_threshold:
        calibrate, banner = calibrate_discrete, "Using discrete conditional"
    else:
        calibrate, banner = calibrate_continuous, "Using continuous conditional"
    if verbose:
        print(banner)
    conditional = calibrate(design, 0, nbootstraps=nbootstraps)["sampler"]

    def tstat(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        residuals = y_drug - elastic_model.predict(X_test)
        return (residuals ** 2).mean()

    outcome = hrt(
        column,
        tstat,
        X_drug,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )
    return outcome["p_value"]
def run_hrt(
    target_feature,
    X,
    y,
    features,
    model,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the HRT p-value for ``target_feature`` under a fitted model.

    The column is located with ``features.get_loc``.  The remaining columns
    are optionally reduced with PCA, the tested column is placed in front of
    them, and a conditional sampler for column 0 of that reduced design is
    calibrated.  ``hrt`` is then run on the full matrix ``X`` with the mean
    squared error of ``model`` as the test statistic.

    Args:
        target_feature: Label passed to ``features.get_loc``.
        X: 2-D array of covariates (it is indexed as ``X[:, j]``).
        y: Responses compared against ``model.predict(...)``.
        features: Object exposing ``get_loc`` (presumably a pandas Index --
            confirm against the caller).
        model: Fitted model exposing ``predict``.
        pca_components: ``n_components`` for scikit-learn's ``PCA``; ``None``
            skips the PCA step and uses the remaining columns unchanged.
        discrete_threshold: Columns with at most this many distinct values
            use ``calibrate_discrete``; others use ``calibrate_continuous``.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Forwarded to ``hrt``.
        verbose: When true, print which conditional family is used.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    column = features.get_loc(target_feature)

    # Drop the tested column, optionally compress what is left.
    keep = np.ones(X.shape[1], dtype=bool)
    keep[column] = False
    context = X[:, keep]
    if pca_components is not None:
        from sklearn.decomposition import PCA

        reducer = PCA(n_components=pca_components)
        context = reducer.fit_transform(context)

    # Tested column goes first so the calibrators can target index 0.
    design = np.concatenate([X[:, column:column + 1], context], axis=1)

    n_distinct = np.unique(X[:, column]).shape[0]
    if n_distinct <= discrete_threshold:
        calibrate, banner = calibrate_discrete, "Using discrete conditional"
    else:
        calibrate, banner = calibrate_continuous, "Using continuous conditional"
    if verbose:
        print(banner)
    conditional = calibrate(design, 0, nbootstraps=nbootstraps)["sampler"]

    def tstat(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        residuals = y - model.predict(X_test)
        return (residuals ** 2).mean()

    outcome = hrt(
        column,
        tstat,
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )
    return outcome["p_value"]
def main():
    """Run the robust linear and nonlinear tests for one feature of one trial.

    Command line: ``<trial> <feature> [--reset-models]``.

    Reads ``X.csv``, ``Y.csv`` and ``truth.csv`` from ``data/<trial>/``.  The
    two models are refit with ``fit_cv`` and saved when ``--reset-models`` is
    given, and loaded with ``torch.load`` otherwise.  For each model the
    p-value for ``feature`` is taken from the table returned by
    ``load_or_create``, then from a per-feature ``.npy`` file if the table
    entry contains NaN, and is computed with ``hrt`` (and saved) only when it
    still contains NaN.  The sampler returned by the linear run is passed on
    to the nonlinear run.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    nperms = 5000
    nbootstraps = 100

    trial = int(sys.argv[1])
    feature = int(sys.argv[2])

    # Interval bounds 50 -/+ each offset, forwarded to hrt as lower/upper.
    intervals = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
    lower = 50 - intervals
    upper = 50 + intervals

    # Slicing past the end yields an empty list, so no length check is needed.
    reset_models = "--reset-models" in sys.argv[3:]

    def trial_path(template, *extra):
        # Fill the trial number (and any extra fields) into a path template.
        return template.format(trial, *extra)

    X_PATH = trial_path("data/{}/X.csv")
    Y_PATH = trial_path("data/{}/Y.csv")
    TRUTH_PATH = trial_path("data/{}/truth.csv")
    LINEAR_PATH = trial_path("data/{}/cv_linear.pt")
    NONLINEAR_PATH = trial_path("data/{}/cv_nonlinear.pt")
    P_LINEAR_PATH = trial_path("data/{}/sweep_robust_linear_p_values")
    P_NONLINEAR_PATH = trial_path("data/{}/sweep_robust_nonlinear_p_values")
    Pi_LINEAR_PATH = trial_path(
        "data/{}/sweep_robust_linear_p_value_{}", feature)
    Pi_NONLINEAR_PATH = trial_path(
        "data/{}/sweep_robust_nonlinear_p_value_{}", feature)

    X = np.loadtxt(X_PATH, delimiter=",")
    y = np.loadtxt(Y_PATH, delimiter=",")
    # The truth file is still read so a missing or malformed file fails here;
    # its values are not used below.
    np.loadtxt(TRUTH_PATH, delimiter=",")

    if not reset_models:
        linear_model = torch.load(LINEAR_PATH)
        nonlinear_model = torch.load(NONLINEAR_PATH)
    else:
        print("Fitting models with N={} P={} S={} nperms={}".format(
            N, P, S, nperms))
        sys.stdout.flush()
        linear_model = fit_cv(X, y, verbose=False, model_type="linear")
        nonlinear_model = fit_cv(X, y, verbose=False, model_type="nonlinear")
        torch.save(linear_model, LINEAR_PATH)
        torch.save(nonlinear_model, NONLINEAR_PATH)

    linear_p_values = load_or_create(P_LINEAR_PATH, P, intervals)
    nonlinear_p_values = load_or_create(P_NONLINEAR_PATH, P, intervals)

    print(
        "Testing with N={} P={} S={} nperms={} nbootstraps={} interval=[{},{}]"
        .format(N, P, S, nperms, nbootstraps, lower, upper))

    def squared_error_of(fitted):
        # Build the test statistic: mean squared error of ``fitted`` against y.
        def statistic(X_test):
            return ((y - fitted.predict(X_test))**2).mean()
        return statistic

    tstat_linear = squared_error_of(linear_model)
    tstat_nonlinear = squared_error_of(nonlinear_model)

    def cached_p_value(table, stem):
        # Prefer the table entry; fall back to the per-feature .npy file when
        # the entry contains NaN and the file exists.
        value = table[feature]
        if np.any(np.isnan(value)) and os.path.exists(stem + ".npy"):
            value = np.load(stem + ".npy")
        return value

    print("Feature: {}".format(feature))
    conditional = None

    linear_p_value = cached_p_value(linear_p_values, Pi_LINEAR_PATH)
    if np.any(np.isnan(linear_p_value)):
        print("Running linear robust CVR test")
        linear_results = hrt(
            feature,
            tstat_linear,
            X,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
            lower=lower,
            upper=upper,
        )
        linear_p_value = linear_results["p_value"]
        # Keep the sampler so the nonlinear run can reuse it.
        conditional = linear_results["sampler"]
        np.save(Pi_LINEAR_PATH, linear_p_value)

    nonlinear_p_value = cached_p_value(nonlinear_p_values, Pi_NONLINEAR_PATH)
    if np.any(np.isnan(nonlinear_p_value)):
        print("Running nonlinear robust CVR test")
        nonlinear_results = hrt(
            feature,
            tstat_nonlinear,
            X,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
            lower=lower,
            upper=upper,
        )
        nonlinear_p_value = nonlinear_results["p_value"]
        np.save(Pi_NONLINEAR_PATH, nonlinear_p_value)

    print(
        "p-values Robust CVR (linear): {}\nRobust CVR (nonlinear): {}".format(
            pretty_str(linear_p_value), pretty_str(nonlinear_p_value)))

    # NOTE: this entry point handles a single feature per invocation; it does
    # not aggregate p-values across features, compute TPR/FDR, or plot.
    print("Done!")
    sys.stdout.flush()
def run(trial, feature, reset=False):
    """Compute and save the CVRT p-value of one feature for every model.

    The dataset comes from ``load_or_create_dataset`` and NumPy's global RNG
    is seeded with ``trial * P + feature``.  Each entry of the model list is
    obtained through ``get_model`` using the folds of the first model.  The
    first model goes through a full ``hrt`` call with ``save_nulls=True``;
    the null samples and null quantiles it returns are then replayed for the
    remaining models so they are not drawn again.  Every p-value is printed
    and written to ``data/<trial>/<prefix>_<feature>.npy``.

    Args:
        trial: Trial identifier used for the dataset, the seed and the paths.
        feature: Column index of the feature under test.
        reset: Forwarded to ``get_model``.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    nperms = 5000
    nfolds = 5

    X, y, _ = load_or_create_dataset(trial, N, P, S)
    np.random.seed(trial * P + feature)

    infos = [
        ModelInfo(trial, "Partial Least Squares", fit_pls, "pls"),
        ModelInfo(trial, "Lasso", fit_lasso_cv, "lasso"),
        ModelInfo(trial, "Elastic Net", fit_elastic_net_cv, "enet"),
        ModelInfo(trial, "Bayesian Ridge", fit_bridge, "bridge"),
        ModelInfo(trial, "Polynomial Kernel Ridge", fit_kridge, "kridge"),
        ModelInfo(trial, "RBF Support Vector", fit_svr, "svr"),
        ModelInfo(trial, "Random Forest", fit_forest, "rf"),
    ]

    # The folds of the first model are shared by every model.
    folds = get_model(infos[0], X, y, create_folds(X, nfolds), reset).folds
    models = []
    for entry in infos:
        models.append(get_model(entry, X, y, folds, reset))

    def squared_error_of(fitted):
        # Build the test statistic: mean squared error of ``fitted`` against y.
        def statistic(X_target):
            return ((y - fitted.predict(X_target))**2).mean()
        return statistic

    def output_path(entry):
        return "data/{}/{}_{}.npy".format(trial, entry.prefix, feature)

    # Load the conditional model for this feature.
    conditional = get_conditional(trial, feature)

    # Full CVRT on the first model, keeping the null draws for later reuse.
    first_info = infos[0]
    print("Running CVRT for {}".format(first_info.name))
    results = hrt(
        feature,
        squared_error_of(models[0]),
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
        save_nulls=True,
    )
    p_value = results["p_value"]
    print("p={}".format(p_value))
    np.save(output_path(first_info), p_value)

    X_nulls = results["samples_null"]
    quantile_nulls = results["quantiles_null"]

    # Replay the stored null samples for each remaining model.
    X_null = np.copy(X)
    for entry, fitted in zip(infos[1:], models[1:]):
        print("Running cached CVRT for {}".format(entry.name))
        statistic = squared_error_of(fitted)
        t_weights = np.full(nperms, np.nan)
        t_null = np.full(nperms, np.nan)
        t_true = statistic(X)
        for perm in range(nperms):
            if perm % 500 == 0:
                print("Trial {}".format(perm))

            # Test statistic with the feature replaced by a null draw.
            X_null[:, feature] = X_nulls[perm]
            t_null[perm] = statistic(X_null)

            # Column 1 over-estimates the likelihood and is used when the
            # null statistic does not exceed the observed one; column 0
            # under-estimates it and is used otherwise.
            which = 1 if t_null[perm] <= t_true else 0
            t_weights[perm] = quantile_nulls[perm, which]

        p_value = t_weights[t_null <= t_true].sum() / t_weights.sum()
        print("p={}".format(p_value))
        np.save(output_path(entry), p_value)
def run(trial, feature, reset, cv, robust):
    """Fit or load the two models and run the tests for a single feature.

    The dataset comes from ``load_or_create_dataset``.  The linear and
    nonlinear models are loaded from their checkpoints when ``reset`` is
    false and BOTH checkpoint files exist; otherwise both are refit (with
    ``fit_cv`` when ``cv`` is true, else with ``fit_nn`` on all but the last
    ``T`` rows) and saved.  For trial 0, truth-versus-prediction scatter
    plots are written under ``plots/``.  Each test is run only if its entry
    in the p-value table is NaN and no per-feature ``.npy`` result file
    exists yet; results are saved under ``data/<trial>/``.

    Args:
        trial: Trial identifier used for the dataset and all file paths.
        feature: Column index of the feature under test.
        reset: When true, refit the models even if checkpoints exist.
        cv: When true, use cross-validated models and the full data as both
            training and evaluation set; otherwise hold out the last ``T``
            rows for evaluation.
        robust: When true, use 100 bootstraps, skip the permutation test,
            and save the ``lower``/``upper`` bounds returned by ``hrt``.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    T = 100  # test sample size
    nperms = 5000

    model_prefix = "cv_" if cv else ""
    p_prefix = "cv_" if cv else ""
    p_prefix += "robust_" if robust else ""
    nbootstraps = 100 if robust else 1
    LINEAR_PATH = "data/{}/{}linear.pt".format(trial, model_prefix)
    NONLINEAR_PATH = "data/{}/{}nonlinear.pt".format(trial, model_prefix)
    P_PERM_PATH = "data/{}/{}perm_p_values".format(trial, p_prefix)
    P_LINEAR_PATH = "data/{}/{}linear_p_values".format(trial, p_prefix)
    P_NONLINEAR_PATH = "data/{}/{}nonlinear_p_values".format(trial, p_prefix)
    Pi_PERM_PATH = "data/{}/{}perm_p_values_{}".format(trial, p_prefix,
                                                       feature)
    Pi_LINEAR_PATH = "data/{}/{}linear_p_values_{}".format(
        trial, p_prefix, feature)
    Pi_NONLINEAR_PATH = "data/{}/{}nonlinear_p_values_{}".format(
        trial, p_prefix, feature)
    BOUNDS_LINEAR_PATH = "data/{}/{}linear_bounds_{}".format(
        trial, p_prefix, feature)
    BOUNDS_NONLINEAR_PATH = "data/{}/{}nonlinear_bounds_{}".format(
        trial, p_prefix, feature)
    CONDITIONAL_PATH = "data/{}/conditional_{}{}.pt".format(
        trial, p_prefix, feature)

    X, y, _ = load_or_create_dataset(trial, N, P, S)

    # Load the checkpoints only when both are present. Checking just the
    # linear file would crash in torch.load if the nonlinear file is missing
    # (e.g. after an interrupted earlier run); refitting both is the safe
    # fallback.
    have_checkpoints = (os.path.exists(LINEAR_PATH)
                        and os.path.exists(NONLINEAR_PATH))
    if not reset and have_checkpoints:
        linear_model = torch.load(LINEAR_PATH)
        nonlinear_model = torch.load(NONLINEAR_PATH)
    else:
        # Train the models
        print("Fitting models with N={} P={} S={} T={} nperms={}".format(
            N, P, S, T, nperms))
        sys.stdout.flush()
        if cv:
            print("Using CV models")
            linear_model = fit_cv(X, y, verbose=False, model_type="linear")
            nonlinear_model = fit_cv(X, y, verbose=False,
                                     model_type="nonlinear")
        else:
            print("Using holdout models")
            linear_model = fit_nn(X[:-T], y[:-T], verbose=False,
                                  model_type="linear")
            nonlinear_model = fit_nn(X[:-T], y[:-T], verbose=False,
                                     model_type="nonlinear")
        torch.save(linear_model, LINEAR_PATH)
        torch.save(nonlinear_model, NONLINEAR_PATH)

    # Track all the p-values (the permutation table is unused when robust).
    perm_p_values = load_or_create(P_PERM_PATH, P) if not robust else None
    linear_p_values = load_or_create(P_LINEAR_PATH, P)
    nonlinear_p_values = load_or_create(P_NONLINEAR_PATH, P)

    # With CV models the full data serves as both train and evaluation set;
    # otherwise the last T rows are held out for evaluation.
    y_train = y if cv else y[:-T]
    y_test = y if cv else y[-T:]
    X_train = X if cv else X[:-T]
    X_test = None if cv else X[-T:]

    def tstat_linear(X_target):
        # Mean squared error of the linear model on the evaluation responses.
        return ((y_test - linear_model.predict(X_target))**2).mean()

    def tstat_nonlinear(X_target):
        # Mean squared error of the nonlinear model on the evaluation
        # responses.
        return ((y_test - nonlinear_model.predict(X_target))**2).mean()

    if trial == 0:
        import matplotlib
        matplotlib.use("Agg")
        # Imported here, after the backend is selected, so this block does
        # not rely on a module-level ``plt`` binding.
        import matplotlib.pyplot as plt
        import seaborn as sns

        suffix = "-cv" if cv else ""
        with sns.axes_style("white", {"legend.frameon": True}):
            # Same figure for each model; nonlinear first, then linear.
            for label, fitted in (("nonlinear", nonlinear_model),
                                  ("linear", linear_model)):
                plt.rc("font", weight="bold")
                plt.rc("grid", lw=3)
                plt.rc("lines", lw=2)
                plt.rc("axes", lw=2)
                plt.scatter(y_train, fitted.predict(X_train))
                # Identity line for reference.
                plt.plot([y.min(), y.max()], [y.min(), y.max()],
                         color="red", ls="--")
                plt.xlabel("Truth", fontsize=18, weight="bold")
                plt.ylabel("Predicted", fontsize=18, weight="bold")
                plt.savefig(
                    "plots/liang-{}-fit{}.pdf".format(label, suffix),
                    bbox_inches="tight",
                )
                plt.close()

    conditional = None
    lower = None
    upper = None
    perm_folds = nonlinear_model.folds if cv else None
    robust_tag = "robust" if robust else ""
    cv_tag = "cv" if cv else ""

    print("Feature: {}".format(feature))

    if not robust:
        print("Running permutation test")
        if (np.isnan(perm_p_values[feature])
                and not os.path.exists(Pi_PERM_PATH + ".npy")):
            permer = PermutationConditional(X if cv else X[-T:], feature,
                                            perm_folds)
            perm_p_value = hrt(
                feature,
                tstat_nonlinear,
                X_train,
                X_test=X_test,
                nperms=nperms,
                conditional=permer,
            )["p_value"]
            np.save(Pi_PERM_PATH, perm_p_value)
            print("Trial {} feature {} {} {} permutation p={}".format(
                trial, feature, robust_tag, cv_tag, perm_p_value))

    print("Running linear HRT")
    if (np.isnan(linear_p_values[feature])
            and not os.path.exists(Pi_LINEAR_PATH + ".npy")):
        linear_results = hrt(
            feature,
            tstat_linear,
            X_train,
            X_test=X_test,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
        )
        linear_p_value = linear_results["p_value"]
        # Keep the sampler so the nonlinear run can reuse it.
        conditional = linear_results["sampler"]
        np.save(Pi_LINEAR_PATH, linear_p_value)
        print("Trial {} feature {} {} {} linear hrt p={}".format(
            trial, feature, robust_tag, cv_tag, linear_p_value))
        if robust:
            # These bounds are also forwarded to the nonlinear run below.
            lower = linear_results["lower"]
            upper = linear_results["upper"]
            np.save(BOUNDS_LINEAR_PATH, np.concatenate([lower, upper]))

    print("Running nonlinear HRT")
    if (np.isnan(nonlinear_p_values[feature])
            and not os.path.exists(Pi_NONLINEAR_PATH + ".npy")):
        nonlinear_results = hrt(
            feature,
            tstat_nonlinear,
            X_train,
            X_test=X_test,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
            lower=lower,
            upper=upper,
        )
        nonlinear_p_value = nonlinear_results["p_value"]
        np.save(Pi_NONLINEAR_PATH, nonlinear_p_value)
        torch.save(nonlinear_results["sampler"], CONDITIONAL_PATH)
        print("Trial {} feature {} {} {} nonlinear hrt p={}".format(
            trial, feature, robust_tag, cv_tag, nonlinear_p_value))
        if robust:
            lower = nonlinear_results["lower"]
            upper = nonlinear_results["upper"]
            np.save(BOUNDS_NONLINEAR_PATH, np.concatenate([lower, upper]))

    print("")
    print("Done!")
    sys.stdout.flush()