Example #1
0
def run_hrt(
    feat_idx,
    X_drug,
    y_drug,
    elastic_model,
    features,
    ccle_features,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Run the HRT for one feature of a fitted model and return its p-value.

    The target is ``ccle_features[feat_idx]``; its column position in
    ``X_drug`` is obtained from ``features.get_loc``. A conditional sampler
    for that column is calibrated on a reduced design matrix (target column
    first, followed by a PCA projection of every other column), and the
    sampler is then handed to ``hrt`` together with the full ``X_drug``.

    Parameters
    ----------
    feat_idx : index into ``ccle_features`` selecting the target name.
    X_drug : 2-D array indexed as ``X_drug[:, column]``.
    y_drug : responses compared against ``elastic_model.predict(...)``.
    elastic_model : object exposing ``predict(X)``.
    features : object exposing ``get_loc(name)`` (presumably a pandas Index
        -- confirm against the caller).
    ccle_features : indexable collection of feature names.
    pca_components : ``n_components`` passed to ``sklearn``'s ``PCA``.
    discrete_threshold : the discrete calibrator is used when the target
        column has at most this many unique values.
    nbootstraps : forwarded to the calibrator.
    nperms : forwarded to ``hrt``.
    verbose : print which feature and which calibrator are in use.

    Returns
    -------
    The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    gene_target = ccle_features[feat_idx]
    feature = features.get_loc(gene_target)
    nunique = np.unique(X_drug[:, feature]).size
    if verbose:
        print(
            "{} is feature number {} with {} unique values".format(
                gene_target, feature, nunique
            )
        )

    # Everything except the target column is compressed with PCA.
    keep_columns = np.ones(X_drug.shape[1], dtype=bool)
    keep_columns[feature] = False
    other_columns = X_drug[:, keep_columns]
    from sklearn.decomposition import PCA

    reducer = PCA(n_components=pca_components)
    projected = reducer.fit_transform(other_columns)

    # The target column goes first so the calibrators can be told to model
    # column 0 of the reduced matrix.
    target_column = X_drug[:, feature : feature + 1]
    X_transform = np.concatenate((target_column, projected), axis=1)

    use_discrete = nunique <= discrete_threshold
    if verbose:
        if use_discrete:
            print("Using discrete conditional")
        else:
            print("Using continuous conditional")
    calibrate = calibrate_discrete if use_discrete else calibrate_continuous
    conditional = calibrate(X_transform, 0, nbootstraps=nbootstraps)["sampler"]

    def tstat(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        residuals = y_drug - elastic_model.predict(X_test)
        return (residuals ** 2).mean()

    # NOTE(review): relies on the calibrated sampler exposing a two-element
    # ``quantiles`` attribute -- confirm against calibrate_*.
    outcome = hrt(
        feature,
        tstat,
        X_drug,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )
    return outcome["p_value"]
Example #2
0
def run_hrt(
    target_feature,
    X,
    y,
    features,
    model,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Run the HRT for ``target_feature`` and return its p-value.

    A conditional sampler for the target column is calibrated on a reduced
    design matrix whose column 0 is the target column and whose remaining
    columns are the other features (optionally projected with PCA). The
    sampler is then passed to ``hrt`` together with the full ``X``.

    Parameters
    ----------
    target_feature : name looked up with ``features.get_loc``.
    X : 2-D array indexed as ``X[:, column]``.
    y : responses compared against ``model.predict(...)``.
    features : object exposing ``get_loc(name)`` (presumably a pandas Index
        -- confirm against the caller).
    model : object exposing ``predict(X)``.
    pca_components : ``n_components`` for ``sklearn``'s ``PCA``; ``None``
        skips the projection and uses the other columns as they are.
    discrete_threshold : the discrete calibrator is used when the target
        column has at most this many unique values.
    nbootstraps : forwarded to the calibrator.
    nperms : forwarded to ``hrt``.
    verbose : print which calibrator is in use.

    Returns
    -------
    The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    feature_idx = features.get_loc(target_feature)
    fmask = np.ones(X.shape[1], dtype=bool)
    fmask[feature_idx] = False
    X_transform = X[:, fmask]
    if pca_components is not None:
        from sklearn.decomposition import PCA

        pca = PCA(n_components=pca_components)
        X_transform = pca.fit_transform(X_transform)

    # The target column must always be prepended: the calibrators below are
    # told to model column 0. Previously this only happened on the PCA path,
    # so with pca_components=None the sampler was fit to the wrong feature.
    X_transform = np.concatenate(
        [X[:, feature_idx:feature_idx + 1], X_transform], axis=1)

    nunique = np.unique(X[:, feature_idx]).shape[0]
    if nunique <= discrete_threshold:
        if verbose:
            print("Using discrete conditional")
        results = calibrate_discrete(X_transform, 0, nbootstraps=nbootstraps)
    else:
        if verbose:
            print("Using continuous conditional")
        results = calibrate_continuous(X_transform, 0, nbootstraps=nbootstraps)
    conditional = results["sampler"]

    def tstat(X_test):
        # Mean squared prediction error of the fitted model on X_test.
        return ((y - model.predict(X_test))**2).mean()

    # NOTE(review): relies on the calibrated sampler exposing a two-element
    # ``quantiles`` attribute -- confirm against calibrate_*.
    p_value = hrt(
        feature_idx,
        tstat,
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )["p_value"]
    return p_value
Example #3
0
def hrt(feature,
        tstat_fn,
        X,
        X_test=None,
        nperms=None,
        verbose=False,
        conditional=None,
        nquantiles=101,
        nbootstraps=100,
        nfolds=5,
        ks_threshold=0.005,
        tv_threshold=0.005,
        p_threshold=0,
        lower=None,
        upper=None,
        save_nulls=False):
    '''Perform the heldout data randomization test (HRT) for one feature.

    The column ``feature`` of the evaluation matrix (``X_test`` if given,
    otherwise ``X``) is repeatedly replaced by draws from a conditional null
    sampler, and ``tstat_fn`` is re-evaluated on each resampled matrix. The
    p-value is the weighted fraction of null statistics that are <= the
    statistic on the real data.

    Parameters
    ----------
    feature : column index of the feature under test.
    tstat_fn : callable taking a data matrix and returning a scalar
        statistic; smaller null values count against the feature.
    X : 2-D training array.
    X_test : optional 2-D heldout array. When given, the statistic is
        evaluated on it, and it is concatenated with ``X`` to fit the
        built-in conditional models.
    nperms : number of null draws; defaults to ``10 * max(N, P)`` for an
        ``N x P`` matrix ``X``.
    verbose : truthy prints progress; a value > 1 also prints per-draw
        details (this imports ``pretty_str`` from ``utils``).
    conditional : ``'continuous'``, ``'discrete'``, a custom sampler, or
        ``None``. ``None`` picks ``'discrete'`` when the number of unique
        values ``u`` of the feature in ``X`` satisfies ``u <= 0.25 * N`` and
        ``'continuous'`` otherwise. A custom sampler is not calibrated here.
        It is called with no arguments and must return ``(sample, quants)``:
        ``sample`` is written into the feature column of the evaluation
        matrix, and ``quants`` must have ``len(lower) + len(upper)``
        entries. The first ``len(lower)`` entries weight draws whose
        statistic exceeds the real one; the rest weight the other draws.
        The sampler must accept assignment of a ``quantiles`` attribute.
    nquantiles, nbootstraps, nfolds, p_threshold : forwarded to the
        built-in calibrators.
    ks_threshold : forwarded to ``calibrate_continuous``.
    tv_threshold : forwarded to ``calibrate_discrete``.
    lower, upper : scalar or 1-D array of quantiles. When omitted, the
        calibrated values are used for the built-in conditionals and 50 for
        a custom one.
    save_nulls : also return every sampled null column.

    Returns
    -------
    dict with the keys ``'sampler'``, ``'lower'``, ``'upper'``,
    ``'p_value'``, ``'t_stat'``, ``'t_null'``, ``'t_weights'``,
    ``'quantiles_null'`` and, if ``save_nulls`` is set, ``'samples_null'``
    (shape ``(nperms, rows of the evaluation matrix)``). For the built-in
    conditionals the dict also holds whatever the calibrator returned.
    '''

    # Find the test type automatically
    if conditional is None:
        u = len(np.unique(X[:, feature]))
        conditional = 'discrete' if u <= 0.25 * X.shape[0] else 'continuous'

    if conditional == 'continuous':
        if verbose:
            print('Running continuous HRT')
        # Fit a robust conditional model for X_j
        X_train = np.concatenate([X, X_test],
                                 axis=0) if X_test is not None else X
        results = calibrate_continuous(X_train,
                                       feature,
                                       X_test=X_test,
                                       nquantiles=nquantiles,
                                       nbootstraps=nbootstraps,
                                       nfolds=nfolds,
                                       ks_threshold=ks_threshold,
                                       p_threshold=p_threshold)
        conditional = results['sampler']

        # If no quantiles have been specified, use the auto-calibrated ones
        if lower is None:
            lower = np.array([results['lower']])
        if upper is None:
            upper = np.array([results['upper']])
    elif conditional == 'discrete':
        if verbose:
            print('Running discrete HRT')
        # Fit a robust conditional model for X_j
        X_train = np.concatenate([X, X_test],
                                 axis=0) if X_test is not None else X
        results = calibrate_discrete(X_train,
                                     feature,
                                     X_test=X_test,
                                     nquantiles=nquantiles,
                                     nbootstraps=nbootstraps,
                                     nfolds=nfolds,
                                     tv_threshold=tv_threshold,
                                     p_threshold=p_threshold)
        conditional = results['sampler']

        # If no quantiles have been specified, use the auto-calibrated ones
        if lower is None:
            lower = np.array([results['lower']])
        if upper is None:
            upper = np.array([results['upper']])
    else:
        results = {'sampler': conditional}
        if verbose:
            print('Running HRT with custom conditional model')

    N = X.shape[0]
    P = X.shape[1]

    # Order of magnitude more permutations than data points by default
    if nperms is None:
        nperms = max(N * 10, P * 10)

    # If no quantiles have been chosen, assume we can use the median
    if lower is None:
        lower = np.array([50])
    if upper is None:
        upper = np.array([50])

    # If we were given scalar quantiles, convert them to 1d arrays
    if np.isscalar(lower):
        lower = np.array([lower])
    if np.isscalar(upper):
        upper = np.array([upper])

    results['lower'] = lower
    results['upper'] = upper

    # Set the quantiles from the bootstrap models
    quantiles = np.concatenate([lower, upper])
    conditional.quantiles = quantiles

    # Get the test-statistic using the real data
    X_null = np.copy(X) if X_test is None else np.copy(X_test)
    t_true = tstat_fn(X_null)
    t_null = np.zeros(nperms)
    quants_null = np.zeros((nperms, quantiles.shape[0]))
    t_weights = np.zeros((nperms, len(lower), len(upper)))
    if save_nulls:
        # Samples are drawn for the evaluation matrix, which is X_test when
        # one is supplied, so size the buffer from X_null rather than X.
        X_null_samples = np.full((nperms, X_null.shape[0]), np.nan)
    if verbose > 1:
        # Project-local helper, only needed for the detailed trace below.
        from utils import pretty_str
    for perm in range(nperms):
        # Progress output respects the verbose flag.
        if verbose and (perm % 500) == 0:
            print('Trial {}'.format(perm))
        # Sample from the conditional null model
        X_null[:, feature], quants_null[perm] = conditional()

        # Save the null if desired
        if save_nulls:
            X_null_samples[perm] = X_null[:, feature]

        # Get the test-statistic under the null
        t_null[perm] = tstat_fn(X_null)

        if t_null[perm] <= t_true:
            # Over-estimate the likelihood: broadcast the upper-quantile
            # entries along the last axis of the (lower, upper) weight grid
            t_weights[perm] = quants_null[perm, len(lower):][np.newaxis, :]
        else:
            # Under-estimate the likelihood: broadcast the lower-quantile
            # entries along the first axis of the weight grid
            t_weights[perm] = quants_null[perm, :len(lower)][:, np.newaxis]

        if verbose > 1:
            print('t_true: {} t_null: {} weight:\n{}'.format(
                t_true, t_null[perm], pretty_str(np.exp(t_weights[perm]))))

    # Calculate the p-value conservatively using the calibrated confidence weights
    results['p_value'] = np.squeeze(t_weights[t_null <= t_true].sum(axis=0) /
                                    t_weights.sum(axis=0))
    results['t_stat'] = t_true
    results['t_null'] = t_null
    results['t_weights'] = np.squeeze(t_weights.sum(axis=0))
    results['quantiles_null'] = quants_null
    if save_nulls:
        results['samples_null'] = X_null_samples

    return results