def run_hrt(
    feat_idx,
    X_drug,
    y_drug,
    elastic_model,
    features,
    ccle_features,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the HRT p-value for one feature of a fitted response model.

    The feature name is read from ``ccle_features[feat_idx]`` and mapped to
    a column of ``X_drug`` through ``features.get_loc`` (presumably a pandas
    Index -- confirm against the caller).  Every other column is compressed
    with PCA to ``pca_components`` components, the target column is put
    back in front as column 0, and a conditional sampler for that column is
    calibrated on the reduced matrix.  The sampler is then handed to
    ``hrt`` together with a mean-squared-error test statistic built from
    ``elastic_model.predict`` and ``y_drug``.

    Args:
        feat_idx: Position of the feature name inside ``ccle_features``.
        X_drug: 2-D feature matrix; columns are indexed by ``features``.
        y_drug: Response values compared against the model predictions.
        elastic_model: Fitted model exposing ``predict``.
        features: Object with ``get_loc`` mapping a name to a column.
        ccle_features: Indexable collection of feature names.
        pca_components: ``n_components`` forwarded to sklearn's ``PCA``.
        discrete_threshold: The discrete calibrator is used when the target
            column has at most this many distinct values.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Number of null draws requested from ``hrt``.
        verbose: Print which feature and which conditional are used.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    target_name = ccle_features[feat_idx]
    column = features.get_loc(target_name)
    n_distinct = np.unique(X_drug[:, column]).shape[0]
    if verbose:
        print(
            "{} is feature number {} with {} unique values".format(
                target_name, column, n_distinct
            )
        )

    # Everything except the target column goes through PCA.
    keep = np.ones(X_drug.shape[1], dtype=bool)
    keep[column] = False
    others = X_drug[:, keep]

    from sklearn.decomposition import PCA

    reducer = PCA(n_components=pca_components)
    others = reducer.fit_transform(others)

    # The target column sits at index 0 of the matrix used for calibration.
    target_column = X_drug[:, column : column + 1]
    design = np.concatenate([target_column, others], axis=1)

    if n_distinct <= discrete_threshold:
        if verbose:
            print("Using discrete conditional")
        calibrate = calibrate_discrete
    else:
        if verbose:
            print("Using continuous conditional")
        calibrate = calibrate_continuous
    sampler = calibrate(design, 0, nbootstraps=nbootstraps)["sampler"]

    def squared_error(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        return ((y_drug - elastic_model.predict(X_test)) ** 2).mean()

    outcome = hrt(
        column,
        squared_error,
        X_drug,
        nperms=nperms,
        conditional=sampler,
        lower=sampler.quantiles[0],
        upper=sampler.quantiles[1],
    )
    return outcome["p_value"]
def run_hrt(
    target_feature,
    X,
    y,
    features,
    model,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the HRT p-value for ``target_feature`` under a fitted model.

    The column of ``X`` for ``target_feature`` is located with
    ``features.get_loc`` (presumably a pandas Index -- confirm against the
    caller).  The remaining columns are optionally reduced with PCA, the
    target column is placed in front as column 0, and a conditional sampler
    for that column is calibrated on the resulting matrix.  ``hrt`` is then
    run on the original ``X`` with a mean-squared-error test statistic
    built from ``model.predict`` and ``y``.

    Args:
        target_feature: Name of the feature to test.
        X: 2-D feature matrix; columns are indexed by ``features``.
        y: Response values compared against the model predictions.
        features: Object with ``get_loc`` mapping a name to a column.
        model: Fitted model exposing ``predict``.
        pca_components: ``n_components`` forwarded to sklearn's ``PCA``;
            ``None`` skips the PCA step entirely.
        discrete_threshold: The discrete calibrator is used when the target
            column has at most this many distinct values.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Number of null draws requested from ``hrt``.
        verbose: Print which conditional model is used.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    column = features.get_loc(target_feature)

    # Select every column except the one under test.
    keep = np.ones(X.shape[1], dtype=bool)
    keep[column] = False
    others = X[:, keep]

    if pca_components is not None:
        from sklearn.decomposition import PCA

        reducer = PCA(n_components=pca_components)
        others = reducer.fit_transform(others)

    # The target column sits at index 0 of the matrix used for calibration.
    target_column = X[:, column:column + 1]
    design = np.concatenate([target_column, others], axis=1)

    n_distinct = np.unique(X[:, column]).shape[0]
    if n_distinct <= discrete_threshold:
        if verbose:
            print("Using discrete conditional")
        calibrate = calibrate_discrete
    else:
        if verbose:
            print("Using continuous conditional")
        calibrate = calibrate_continuous
    sampler = calibrate(design, 0, nbootstraps=nbootstraps)["sampler"]

    def squared_error(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        return ((y - model.predict(X_test)) ** 2).mean()

    outcome = hrt(
        column,
        squared_error,
        X,
        nperms=nperms,
        conditional=sampler,
        lower=sampler.quantiles[0],
        upper=sampler.quantiles[1],
    )
    return outcome["p_value"]
def hrt(feature, tstat_fn, X, X_test=None, nperms=None, verbose=False,
        conditional=None, nquantiles=101, nbootstraps=100, nfolds=5,
        ks_threshold=0.005, tv_threshold=0.005, p_threshold=0,
        lower=None, upper=None, save_nulls=False):
    '''Perform the heldout data randomization test (HRT).

    The test statistic is first evaluated on the real data (a copy of
    X_test when it is given, otherwise a copy of X). Then, nperms times,
    column `feature` of that copy is overwritten with a draw from the
    conditional model and the statistic is re-evaluated. The p-value is the
    weighted fraction of null statistics that are <= the true statistic.

    If conditional is not specified, it is inferred from the number of
    unique values, u, in X[:, feature]: u <= 0.25*N uses discrete;
    otherwise, continuous is used. The strings 'discrete' and 'continuous'
    may also be passed explicitly; in both cases the conditional model is
    fit with calibrate_discrete / calibrate_continuous.

    If the conditional is not simply discrete or continuous (e.g. ordinal,
    discrete-continuous mixtures, bounded continuous, etc.) the user can
    pass a custom conditional instead. No calibration is performed for a
    custom conditional -- making sure the distribution matches is left to
    the caller. A custom conditional must:

      * accept attribute assignment (its `quantiles` attribute is set to
        the concatenation of lower and upper before sampling starts);
      * be callable with no arguments and return a pair (sample, probs).
        `sample` is a random draw for the feature column of the matrix
        being tested. `probs` must broadcast to length
        len(lower) + len(upper): the first len(lower) entries are used as
        weights when the null statistic exceeds the true statistic, the
        remaining len(upper) entries when it does not.

    If the sampling model does not produce confidence intervals, `probs`
    can simply be all 1s; this corresponds to an HRT under the assumption
    that the conditional distribution is the true distribution.

    Parameters
    ----------
    feature : index of the column of X to test.
    tstat_fn : callable taking a data matrix and returning a scalar test
        statistic.
    X : 2-D array of data.
    X_test : optional 2-D array; when given, the statistic is computed on
        it and X is stacked with it to fit the automatic conditionals.
    nperms : number of null draws; defaults to 10 * max(N, P) for X of
        shape (N, P).
    verbose : truthy to print progress; > 1 to also print every draw.
    conditional : None, 'discrete', 'continuous' or a custom sampler.
    nquantiles, nbootstraps, nfolds, ks_threshold, tv_threshold,
    p_threshold : forwarded to the calibration routines (ks_threshold only
        to the continuous one, tv_threshold only to the discrete one).
    lower, upper : scalar or 1-D array of quantiles. When omitted, the
        auto-calibrated values are used for the automatic conditionals and
        50 is used for a custom conditional.
    save_nulls : if True, keep every sampled null column.

    Returns
    -------
    dict with keys 'sampler', 'lower', 'upper', 'p_value', 't_stat',
    't_null', 't_weights', 'quantiles_null' and, when save_nulls is True,
    'samples_null'. For the automatic conditionals the dict is the one
    returned by the calibration routine, extended with these keys.
    '''
    # Find the test type automatically
    if conditional is None:
        u = len(np.unique(X[:, feature]))
        conditional = 'discrete' if u <= 0.25 * X.shape[0] else 'continuous'

    if conditional == 'continuous':
        if verbose:
            print('Running continuous HRT')

        # Fit a robust conditional model for X_j
        X_train = np.concatenate([X, X_test], axis=0) if X_test is not None else X
        results = calibrate_continuous(X_train, feature, X_test=X_test,
                                       nquantiles=nquantiles,
                                       nbootstraps=nbootstraps,
                                       nfolds=nfolds,
                                       ks_threshold=ks_threshold,
                                       p_threshold=p_threshold)
        conditional = results['sampler']

        # If no quantiles have been specified, use the auto-calibrated ones
        if lower is None:
            lower = np.array([results['lower']])
        if upper is None:
            upper = np.array([results['upper']])
    elif conditional == 'discrete':
        if verbose:
            print('Running discrete HRT')

        # Fit a robust conditional model for X_j
        X_train = np.concatenate([X, X_test], axis=0) if X_test is not None else X
        results = calibrate_discrete(X_train, feature, X_test=X_test,
                                     nquantiles=nquantiles,
                                     nbootstraps=nbootstraps,
                                     nfolds=nfolds,
                                     tv_threshold=tv_threshold,
                                     p_threshold=p_threshold)
        conditional = results['sampler']

        # If no quantiles have been specified, use the auto-calibrated ones
        if lower is None:
            lower = np.array([results['lower']])
        if upper is None:
            upper = np.array([results['upper']])
    else:
        results = {'sampler': conditional}
        if verbose:
            print('Running HRT with custom conditional model')

    N = X.shape[0]
    P = X.shape[1]

    # Order of magnitude more permutations than data points by default
    if nperms is None:
        nperms = max(N * 10, P * 10)

    # If no quantiles have been chosen, assume we can use the median
    if lower is None:
        lower = np.array([50])
    if upper is None:
        upper = np.array([50])

    # If we were given scalar quantiles, convert them to 1d arrays
    if np.isscalar(lower):
        lower = np.array([lower])
    if np.isscalar(upper):
        upper = np.array([upper])
    results['lower'] = lower
    results['upper'] = upper

    # Set the quantiles from the bootstrap models
    quantiles = np.concatenate([lower, upper])
    conditional.quantiles = quantiles

    # Get the test-statistic using the real data. Work on a copy so the
    # caller's matrix is never overwritten by the null draws.
    X_null = np.copy(X) if X_test is None else np.copy(X_test)
    t_true = tstat_fn(X_null)
    t_null = np.zeros(nperms)
    quants_null = np.zeros((nperms, quantiles.shape[0]))
    t_weights = np.zeros((nperms, len(lower), len(upper)))
    if save_nulls:
        # One row per draw; the row length must match the matrix that is
        # actually resampled, which is X_test (not X) when X_test is given.
        X_null_samples = np.full((nperms, X_null.shape[0]), np.nan)

    for perm in range(nperms):
        # Progress output is diagnostic, so it honours the verbose flag.
        if verbose and (perm % 500) == 0:
            print('Trial {}'.format(perm))

        # Sample from the conditional null model
        X_null[:, feature], quants_null[perm] = conditional()

        # Save the null if desired
        if save_nulls:
            X_null_samples[perm] = X_null[:, feature]

        # Get the test-statistic under the null
        t_null[perm] = tstat_fn(X_null)
        if t_null[perm] <= t_true:
            # Over-estimate the likelihood: the upper entries are broadcast
            # along the `lower` axis of the (len(lower), len(upper)) grid.
            t_weights[perm] = quants_null[perm, len(lower):][np.newaxis, :]
        else:
            # Under-estimate the likelihood: the lower entries are broadcast
            # along the `upper` axis of the grid.
            t_weights[perm] = quants_null[perm, :len(lower)][:, np.newaxis]

        if verbose > 1:
            from utils import pretty_str
            print('t_true: {} t_null: {} weight:\n{}'.format(
                t_true, t_null[perm], pretty_str(np.exp(t_weights[perm]))))

    # Calculate the p-value conservatively using the calibrated confidence
    # weights: weighted mass of draws with t_null <= t_true over total mass,
    # for every (lower, upper) quantile pair.
    results['p_value'] = np.squeeze(
        t_weights[t_null <= t_true].sum(axis=0) / t_weights.sum(axis=0))
    results['t_stat'] = t_true
    results['t_null'] = t_null
    results['t_weights'] = np.squeeze(t_weights.sum(axis=0))
    results['quantiles_null'] = quants_null
    if save_nulls:
        results['samples_null'] = X_null_samples
    return results