Esempio n. 1
0
def run_hrt(
    feat_idx,
    X_drug,
    y_drug,
    elastic_model,
    features,
    ccle_features,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the ``hrt`` p-value for a single column of ``X_drug``.

    The tested column is found by name: ``ccle_features[feat_idx]`` is resolved
    to a column position through ``features.get_loc``.  All remaining columns
    are compressed with PCA, the tested column is placed in front of the
    components, and a conditional sampler is calibrated for column 0 of that
    matrix.  The sampler is then passed to ``hrt`` together with a
    mean-squared-error statistic built from ``elastic_model``.

    Args:
        feat_idx: Position of the feature name inside ``ccle_features``.
        X_drug: 2-D array of covariates (indexed as ``X_drug[:, j]``).
        y_drug: Response values compared against the model predictions.
        elastic_model: Fitted model exposing ``predict``.
        features: Object exposing ``get_loc`` (presumably a pandas Index --
            verify against the caller).
        ccle_features: Sequence of feature names.
        pca_components: ``n_components`` passed to ``PCA``.
        discrete_threshold: Columns with at most this many distinct values
            use ``calibrate_discrete``; others use ``calibrate_continuous``.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Forwarded to ``hrt``.
        verbose: When true, print progress information.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    gene_target = ccle_features[feat_idx]
    feature = features.get_loc(gene_target)
    nunique = np.unique(X_drug[:, feature]).shape[0]
    if verbose:
        print(
            "{} is feature number {} with {} unique values".format(
                gene_target, feature, nunique
            )
        )

    # Boolean selector for every column except the one under test.
    keep = np.ones(X_drug.shape[1], dtype=bool)
    keep[feature] = False

    from sklearn.decomposition import PCA

    components = PCA(n_components=pca_components).fit_transform(X_drug[:, keep])

    # Column 0 is the tested feature; the calibrators are told to model it.
    design = np.hstack([X_drug[:, feature : feature + 1], components])

    use_discrete = nunique <= discrete_threshold
    if verbose:
        if use_discrete:
            print("Using discrete conditional")
        else:
            print("Using continuous conditional")
    calibrate = calibrate_discrete if use_discrete else calibrate_continuous
    conditional = calibrate(design, 0, nbootstraps=nbootstraps)["sampler"]

    def tstat(X_test):
        # Mean squared error of the model on the (possibly resampled) data.
        return ((y_drug - elastic_model.predict(X_test)) ** 2).mean()

    outcome = hrt(
        feature,
        tstat,
        X_drug,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )
    return outcome["p_value"]
Esempio n. 2
0
def run_hrt(
    target_feature,
    X,
    y,
    features,
    model,
    pca_components=100,
    discrete_threshold=10,
    nbootstraps=100,
    nperms=5000,
    verbose=False,
):
    """Return the ``hrt`` p-value for the column named ``target_feature``.

    The target column is located with ``features.get_loc``.  The remaining
    columns are optionally compressed with PCA, the target column is placed
    in front of them, and a conditional sampler is calibrated for column 0 of
    that matrix.  The sampler is passed to ``hrt`` together with a
    mean-squared-error statistic built from ``model``.

    Args:
        target_feature: Name of the feature to test.
        X: 2-D array of covariates (indexed as ``X[:, j]``).
        y: Response values compared against the model predictions.
        features: Object exposing ``get_loc`` (presumably a pandas Index --
            verify against the caller).
        model: Fitted model exposing ``predict``.
        pca_components: ``n_components`` passed to ``PCA``; ``None`` skips the
            PCA step and conditions on the raw remaining columns.
        discrete_threshold: Columns with at most this many distinct values
            use ``calibrate_discrete``; others use ``calibrate_continuous``.
        nbootstraps: Forwarded to the calibration routine.
        nperms: Forwarded to ``hrt``.
        verbose: When true, print which calibration routine is used.

    Returns:
        The ``"p_value"`` entry of the dictionary returned by ``hrt``.
    """
    feature_idx = features.get_loc(target_feature)

    # Everything except the target column forms the conditioning set.
    fmask = np.ones(X.shape[1], dtype=bool)
    fmask[feature_idx] = False
    X_rest = X[:, fmask]
    if pca_components is not None:
        from sklearn.decomposition import PCA

        pca = PCA(n_components=pca_components)
        X_rest = pca.fit_transform(X_rest)

    # The target column must always be prepended: the calibrators below are
    # asked to model column 0.  Previously this only happened when PCA was
    # enabled, so with pca_components=None the target was absent and an
    # unrelated feature ended up in column 0.
    X_transform = np.concatenate(
        [X[:, feature_idx:feature_idx + 1], X_rest], axis=1)

    nunique = np.unique(X[:, feature_idx]).shape[0]
    if nunique <= discrete_threshold:
        if verbose:
            print("Using discrete conditional")
        results = calibrate_discrete(X_transform, 0, nbootstraps=nbootstraps)
    else:
        if verbose:
            print("Using continuous conditional")
        results = calibrate_continuous(X_transform, 0, nbootstraps=nbootstraps)
    conditional = results["sampler"]

    def tstat(X_test):
        # Mean squared error of the model on the (possibly resampled) data.
        return ((y - model.predict(X_test))**2).mean()

    p_value = hrt(
        feature_idx,
        tstat,
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
    )["p_value"]
    return p_value
Esempio n. 3
0
def main():
    """Compute robust CVR p-values for one feature of one trial.

    Command line: ``<trial> <feature> [--reset-models]``.  The trial's data
    are read from ``data/<trial>/``.  The linear and nonlinear models are
    either refit and saved (``--reset-models``) or loaded from disk.  For each
    model, a p-value is taken from the aggregated table, then from the
    per-feature ``.npy`` file if the table entry contains NaN, and only
    computed with ``hrt`` (and saved) if it is still NaN afterwards.  The
    sampler returned by the linear test, when it runs, is passed on to the
    nonlinear test.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    nperms = 5000
    nbootstraps = 100

    trial = int(sys.argv[1])
    feature = int(sys.argv[2])
    # Slicing past the end yields an empty list, so no length check is needed.
    reset_models = "--reset-models" in sys.argv[3:]

    # Symmetric lower/upper bounds around 50.
    intervals = np.arange(0, 50, 5)
    lower = 50 - intervals
    upper = 50 + intervals

    X_PATH = "data/{}/X.csv".format(trial)
    Y_PATH = "data/{}/Y.csv".format(trial)
    TRUTH_PATH = "data/{}/truth.csv".format(trial)
    LINEAR_PATH = "data/{}/cv_linear.pt".format(trial)
    NONLINEAR_PATH = "data/{}/cv_nonlinear.pt".format(trial)
    P_LINEAR_PATH = "data/{}/sweep_robust_linear_p_values".format(trial)
    P_NONLINEAR_PATH = "data/{}/sweep_robust_nonlinear_p_values".format(trial)
    Pi_LINEAR_PATH = "data/{}/sweep_robust_linear_p_value_{}".format(
        trial, feature)
    Pi_NONLINEAR_PATH = "data/{}/sweep_robust_nonlinear_p_value_{}".format(
        trial, feature)

    X = np.loadtxt(X_PATH, delimiter=",")
    y = np.loadtxt(Y_PATH, delimiter=",")
    # The truth file is read but not used below; the read is kept so a
    # missing file still fails at the same point.
    truth = np.loadtxt(TRUTH_PATH, delimiter=",")

    if not reset_models:
        linear_model = torch.load(LINEAR_PATH)
        nonlinear_model = torch.load(NONLINEAR_PATH)
    else:
        print("Fitting models with N={} P={} S={} nperms={}".format(
            N, P, S, nperms))
        sys.stdout.flush()
        linear_model = fit_cv(X, y, verbose=False, model_type="linear")
        nonlinear_model = fit_cv(X, y, verbose=False, model_type="nonlinear")
        torch.save(linear_model, LINEAR_PATH)
        torch.save(nonlinear_model, NONLINEAR_PATH)

    linear_p_values = load_or_create(P_LINEAR_PATH, P, intervals)
    nonlinear_p_values = load_or_create(P_NONLINEAR_PATH, P, intervals)

    print(
        "Testing with N={} P={} S={} nperms={} nbootstraps={} interval=[{},{}]"
        .format(N, P, S, nperms, nbootstraps, lower, upper))

    def tstat_linear(X_test):
        # Mean squared error of the linear model.
        return ((y - linear_model.predict(X_test))**2).mean()

    def tstat_nonlinear(X_test):
        # Mean squared error of the nonlinear model.
        return ((y - nonlinear_model.predict(X_test))**2).mean()

    def lookup_p_value(table, per_feature_path):
        # Prefer the aggregated table; fall back to the per-feature file when
        # the table entry still contains NaN and the file exists.
        value = table[feature]
        npy_path = per_feature_path + ".npy"
        if np.any(np.isnan(value)) and os.path.exists(npy_path):
            value = np.load(npy_path)
        return value

    def robust_test(tstat, sampler):
        # Single hrt invocation shared by both models.
        return hrt(
            feature,
            tstat,
            X,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=sampler,
            lower=lower,
            upper=upper,
        )

    print("Feature: {}".format(feature))

    conditional = None
    linear_p_value = lookup_p_value(linear_p_values, Pi_LINEAR_PATH)
    if np.any(np.isnan(linear_p_value)):
        print("Running linear robust CVR test")
        linear_results = robust_test(tstat_linear, conditional)
        linear_p_value = linear_results["p_value"]
        # Keep the sampler so the nonlinear test can reuse it.
        conditional = linear_results["sampler"]
        np.save(Pi_LINEAR_PATH, linear_p_value)

    nonlinear_p_value = lookup_p_value(nonlinear_p_values, Pi_NONLINEAR_PATH)
    if np.any(np.isnan(nonlinear_p_value)):
        print("Running nonlinear robust CVR test")
        nonlinear_results = robust_test(tstat_nonlinear, conditional)
        nonlinear_p_value = nonlinear_results["p_value"]
        np.save(Pi_NONLINEAR_PATH, nonlinear_p_value)

    print(
        "p-values Robust CVR (linear): {}\nRobust CVR (nonlinear): {}".format(
            pretty_str(linear_p_value), pretty_str(nonlinear_p_value)))

    # NOTE: aggregate TPR/FDR reporting and the trial-0 diagnostic plots that
    # used to follow here were already disabled (commented out); this entry
    # point only produces the per-feature p-values.

    print("Done!")
    sys.stdout.flush()
Esempio n. 4
0
def run(trial, feature, reset=False):
    """Compute and save one p-value per model for ``feature`` in ``trial``.

    The first model in the list is tested with a full ``hrt`` call that also
    returns its null samples and their quantile weights (``save_nulls=True``).
    Every remaining model reuses those null samples: its statistic is
    re-evaluated on each stored sample and the p-value is the weighted share
    of null statistics that do not exceed the observed one.  Each p-value is
    written to ``data/<trial>/<prefix>_<feature>.npy``.

    Args:
        trial: Trial identifier; used for data paths and for seeding.
        feature: Column index of the feature under test.
        reset: Forwarded to ``get_model``.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    nperms = 5000
    nfolds = 5

    X, y, truth = load_or_create_dataset(trial, N, P, S)

    np.random.seed(trial * P + feature)

    infos = [
        ModelInfo(trial, "Partial Least Squares", fit_pls, "pls"),
        ModelInfo(trial, "Lasso", fit_lasso_cv, "lasso"),
        ModelInfo(trial, "Elastic Net", fit_elastic_net_cv, "enet"),
        ModelInfo(trial, "Bayesian Ridge", fit_bridge, "bridge"),
        ModelInfo(trial, "Polynomial Kernel Ridge", fit_kridge, "kridge"),
        ModelInfo(trial, "RBF Support Vector", fit_svr, "svr"),
        ModelInfo(trial, "Random Forest", fit_forest, "rf"),
    ]

    # The folds of the first model are reused for every model.
    folds = get_model(infos[0], X, y, create_folds(X, nfolds), reset).folds
    models = [get_model(info, X, y, folds, reset) for info in infos]

    def mse_statistic(fitted):
        # Build the mean-squared-error statistic for one fitted model.
        def statistic(X_target):
            return ((y - fitted.predict(X_target))**2).mean()
        return statistic

    def report_and_save(model_info, value):
        # Print the p-value and persist it under the model's prefix.
        print("p={}".format(value))
        np.save("data/{}/{}_{}.npy".format(trial, model_info.prefix, feature),
                value)

    # Load the conditional model for this feature
    conditional = get_conditional(trial, feature)

    # Full test for the first model, keeping the null samples for reuse.
    first_info, first_model = infos[0], models[0]
    print("Running CVRT for {}".format(first_info.name))
    results = hrt(
        feature,
        mse_statistic(first_model),
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
        save_nulls=True,
    )
    report_and_save(first_info, results["p_value"])

    # Values taken from the full test; the observed statistic is recomputed
    # per model inside the loop below.
    t_true = results["t_stat"]
    X_nulls = results["samples_null"]
    quantile_nulls = results["quantiles_null"]

    # Scratch copy whose tested column is overwritten with each null sample.
    X_null = np.copy(X)
    for info, model in zip(infos[1:], models[1:]):
        print("Running cached CVRT for {}".format(info.name))
        tstat = mse_statistic(model)
        t_weights = np.full(nperms, np.nan)
        t_null = np.full(nperms, np.nan)
        t_true = tstat(X)
        for perm in range(nperms):
            if perm % 500 == 0:
                print("Trial {}".format(perm))

            X_null[:, feature] = X_nulls[perm]
            t_null[perm] = tstat(X_null)
            # Column 1 over-estimates the likelihood (null statistic at or
            # below the observed one); column 0 under-estimates it.
            t_weights[perm] = (quantile_nulls[perm, 1]
                               if t_null[perm] <= t_true
                               else quantile_nulls[perm, 0])

        at_or_below = t_null <= t_true
        report_and_save(info, t_weights[at_or_below].sum() / t_weights.sum())
Esempio n. 5
0
def run(trial, feature, reset, cv, robust):
    """Run the permutation / linear / nonlinear tests for one feature.

    Models are loaded from ``data/<trial>/`` when ``reset`` is false and the
    linear checkpoint exists; otherwise they are fit (``fit_cv`` when ``cv``
    is true, ``fit_nn`` on all but the last ``T`` rows otherwise) and saved.
    A test is executed only when its entry in the aggregated p-value table is
    NaN and no per-feature ``.npy`` result exists yet.  Results are written
    with ``np.save``; the sampler from the nonlinear test is written with
    ``torch.save``.

    Args:
        trial: Trial identifier used for all data paths; trial 0 additionally
            writes two fit plots.
        feature: Column index of the feature under test.
        reset: When true, refit the models even if a checkpoint exists.
        cv: Selects cross-validated models and the ``cv_`` file prefix.
        robust: Selects 100 bootstraps (instead of 1), the ``robust_`` file
            prefix, saving of the bounds, and skips the permutation test.
    """
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    T = 100  # test sample size
    nperms = 5000

    model_prefix = "cv_" if cv else ""
    p_prefix = ("cv_" if cv else "") + ("robust_" if robust else "")
    nbootstraps = 100 if robust else 1

    LINEAR_PATH = "data/{}/{}linear.pt".format(trial, model_prefix)
    NONLINEAR_PATH = "data/{}/{}nonlinear.pt".format(trial, model_prefix)
    P_PERM_PATH = "data/{}/{}perm_p_values".format(trial, p_prefix)
    P_LINEAR_PATH = "data/{}/{}linear_p_values".format(trial, p_prefix)
    P_NONLINEAR_PATH = "data/{}/{}nonlinear_p_values".format(trial, p_prefix)
    Pi_PERM_PATH = "data/{}/{}perm_p_values_{}".format(
        trial, p_prefix, feature)
    Pi_LINEAR_PATH = "data/{}/{}linear_p_values_{}".format(
        trial, p_prefix, feature)
    Pi_NONLINEAR_PATH = "data/{}/{}nonlinear_p_values_{}".format(
        trial, p_prefix, feature)
    BOUNDS_LINEAR_PATH = "data/{}/{}linear_bounds_{}".format(
        trial, p_prefix, feature)
    BOUNDS_NONLINEAR_PATH = "data/{}/{}nonlinear_bounds_{}".format(
        trial, p_prefix, feature)
    CONDITIONAL_PATH = "data/{}/conditional_{}{}.pt".format(
        trial, p_prefix, feature)

    X, y, truth = load_or_create_dataset(trial, N, P, S)

    if reset or not os.path.exists(LINEAR_PATH):
        # No usable checkpoint: train both models and store them.
        print("Fitting models with N={} P={} S={} T={} nperms={}".format(
            N, P, S, T, nperms))
        sys.stdout.flush()
        if cv:
            print("Using CV models")
            linear_model = fit_cv(X, y, verbose=False, model_type="linear")
            nonlinear_model = fit_cv(
                X, y, verbose=False, model_type="nonlinear")
        else:
            print("Using holdout models")
            linear_model = fit_nn(
                X[:-T], y[:-T], verbose=False, model_type="linear")
            nonlinear_model = fit_nn(
                X[:-T], y[:-T], verbose=False, model_type="nonlinear")
        torch.save(linear_model, LINEAR_PATH)
        torch.save(nonlinear_model, NONLINEAR_PATH)
    else:
        # Only the linear checkpoint's existence is checked before loading.
        linear_model = torch.load(LINEAR_PATH)
        nonlinear_model = torch.load(NONLINEAR_PATH)

    # Aggregated p-value tables; the permutation table is skipped when robust.
    perm_p_values = None if robust else load_or_create(P_PERM_PATH, P)
    linear_p_values = load_or_create(P_LINEAR_PATH, P)
    nonlinear_p_values = load_or_create(P_NONLINEAR_PATH, P)

    # With cv the full data serve as both train and test; otherwise the last
    # T rows are held out.
    if cv:
        X_train, y_train = X, y
        X_test, y_test = None, y
    else:
        X_train, y_train = X[:-T], y[:-T]
        X_test, y_test = X[-T:], y[-T:]

    def tstat_linear(X_target):
        # Mean squared error of the linear model on the test responses.
        return ((y_test - linear_model.predict(X_target))**2).mean()

    def tstat_nonlinear(X_target):
        # Mean squared error of the nonlinear model on the test responses.
        return ((y_test - nonlinear_model.predict(X_target))**2).mean()

    if trial == 0:
        import matplotlib

        matplotlib.use("Agg")
        import seaborn as sns

        # NOTE(review): `plt` is not imported inside this function; it is
        # expected to be available from module scope -- confirm.
        def save_fit_plot(fitted, out_path):
            # Scatter of truth vs. prediction with a dashed identity line.
            plt.rc("font", weight="bold")
            plt.rc("grid", lw=3)
            plt.rc("lines", lw=2)
            plt.rc("axes", lw=2)
            plt.scatter(y_train, fitted.predict(X_train))
            span = [y.min(), y.max()]
            plt.plot(span, span, color="red", ls="--")
            plt.xlabel("Truth", fontsize=18, weight="bold")
            plt.ylabel("Predicted", fontsize=18, weight="bold")
            plt.savefig(out_path, bbox_inches="tight")
            plt.close()

        with sns.axes_style("white", {"legend.frameon": True}):
            save_fit_plot(
                nonlinear_model,
                "plots/liang-nonlinear-fit{}.pdf".format("-cv" if cv else ""),
            )
            save_fit_plot(
                linear_model,
                "plots/liang-linear-fit{}.pdf".format("-cv" if cv else ""),
            )

    def is_pending(table, per_feature_path):
        # A test runs only if the table entry is NaN and no result file exists.
        return np.isnan(
            table[feature]) and not os.path.exists(per_feature_path + ".npy")

    def report(template, value):
        # Shared layout of the three result messages.
        print(template.format(
            trial,
            feature,
            "robust" if robust else "",
            "cv" if cv else "",
            value,
        ))

    def save_bounds(path, outcome):
        # Persist lower and upper bounds back to back and hand them back.
        lo = outcome["lower"]
        hi = outcome["upper"]
        np.save(path, np.concatenate([lo, hi]))
        return lo, hi

    conditional = None
    lower = None
    upper = None
    perm_folds = nonlinear_model.folds if cv else None
    print("Feature: {}".format(feature))

    if not robust:
        print("Running permutation test")
        if is_pending(perm_p_values, Pi_PERM_PATH):
            permer = PermutationConditional(
                X if cv else X[-T:], feature, perm_folds)
            perm_p_value = hrt(
                feature,
                tstat_nonlinear,
                X_train,
                X_test=X_test,
                nperms=nperms,
                conditional=permer,
            )["p_value"]
            np.save(Pi_PERM_PATH, perm_p_value)
            report("Trial {} feature {} {} {} permutation p={}", perm_p_value)

    print("Running linear HRT")
    if is_pending(linear_p_values, Pi_LINEAR_PATH):
        linear_results = hrt(
            feature,
            tstat_linear,
            X_train,
            X_test=X_test,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
        )
        linear_p_value = linear_results["p_value"]
        # The sampler (and, when robust, the bounds) feed the nonlinear test.
        conditional = linear_results["sampler"]
        np.save(Pi_LINEAR_PATH, linear_p_value)
        report("Trial {} feature {} {} {} linear hrt p={}", linear_p_value)
        if robust:
            lower, upper = save_bounds(BOUNDS_LINEAR_PATH, linear_results)

    print("Running nonlinear HRT")
    if is_pending(nonlinear_p_values, Pi_NONLINEAR_PATH):
        nonlinear_results = hrt(
            feature,
            tstat_nonlinear,
            X_train,
            X_test=X_test,
            nperms=nperms,
            nbootstraps=nbootstraps,
            conditional=conditional,
            lower=lower,
            upper=upper,
        )
        nonlinear_p_value = nonlinear_results["p_value"]
        np.save(Pi_NONLINEAR_PATH, nonlinear_p_value)
        torch.save(nonlinear_results["sampler"], CONDITIONAL_PATH)
        report("Trial {} feature {} {} {} nonlinear hrt p={}",
               nonlinear_p_value)
        if robust:
            lower, upper = save_bounds(BOUNDS_NONLINEAR_PATH,
                                       nonlinear_results)

    print("")
    print("Done!")
    sys.stdout.flush()