Example #1
    # Excerpt from the evaluation loop; assumes numpy as np and
    # collections.defaultdict are imported, and that ntrials, P, nperms and
    # the module helpers (ModelInfo, get_model, rf_importance,
    # linear_model_importance) are in scope.
    p_values = defaultdict(lambda: np.full((ntrials, P), np.nan))
    tpr_vals = defaultdict(lambda: np.full(ntrials, np.nan))
    fdr_vals = defaultdict(lambda: np.full(ntrials, np.nan))
    for trial in range(ntrials):
        print("Trial {}".format(trial))
        TRUTH_PATH = "data/{}/truth.csv".format(trial)
        truth = np.loadtxt(TRUTH_PATH, delimiter=",")

        infos = [
            ModelInfo(trial, "Random Forest", None, "rf"),
            ModelInfo(trial, "Bayesian Ridge", None, "bridge"),
            ModelInfo(trial, "Elastic Net", None, "enet"),
            ModelInfo(trial, "Lasso", None, "lasso"),
        ]

        models = [get_model(info, None, None, None, False) for info in infos]

        # Load the p-values for the predictor models
        for info, model in zip(infos, models):
            # Get the heuristic feature-importance ordering from this model
            if info.name == "Random Forest":
                importance = rf_importance(model.models)
            else:
                importance = linear_model_importance(model.models)

            # Load the raw permutation p-values and apply the add-one
            # correction, which keeps every p-value strictly positive
            all_p_filename = "data/{}/{}.npy".format(trial, info.prefix)
            raw_p = np.load(all_p_filename)
            p_values[info.name][trial] = (raw_p * nperms + 1) / (nperms + 1)
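
The division above is the standard add-one adjustment for permutation p-values (cf. Phipson & Smyth, 2010): given a raw estimate p = k/nperms from k exceedances among nperms permutations, the corrected value (k + 1)/(nperms + 1) can never be exactly zero and remains a valid p-value. A minimal self-contained sketch (the counts and nperms here are illustrative, not taken from the original code):

import numpy as np

nperms = 1000                      # illustrative permutation count
k = np.array([0, 5, 250])          # exceedance counts for three features
raw_p = k / nperms                 # naive estimate; can be exactly 0
corrected = (raw_p * nperms + 1) / (nperms + 1)
# equivalent to (k + 1) / (nperms + 1); the smallest attainable value
# is 1 / (nperms + 1) rather than 0
print(corrected)                   # -> approx. [0.000999 0.005994 0.250749]
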
Example #2
# Assumes the module-level helpers ModelInfo, get_model, fit_svr,
# load_or_create_dataset, get_conditional, and empirical_risk_knockoffs are
# importable alongside:
import os

import numpy as np
import torch


def run(trial):
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    T = 100  # test sample size
    fdr_threshold = 0.1

    X, y, truth = load_or_create_dataset(trial, N, P, S)
    np.random.seed(trial * P)

    infos = [
        ModelInfo(trial, "Partial Least Squares", None, "pls"),
        ModelInfo(trial, "Lasso", None, "lasso"),
        ModelInfo(trial, "Elastic Net", None, "enet"),
        ModelInfo(trial, "Bayesian Ridge", None, "bridge"),
        ModelInfo(trial, "Polynomial Kernel Ridge", None, "kridge"),
        ModelInfo(trial, "RBF Support Vector", fit_svr, "svr"),
        ModelInfo(trial, "Random Forest", None, "rf"),
    ]

    folds = get_model(infos[0], X, y, None, False).folds
    models = [get_model(info, X, y, folds, False) for info in infos]

    # Load the saved OLS and neural net models so they also go through the
    # knockoff procedure below
    LINEAR_PATH = "data/{}/cv_linear.pt".format(trial)
    NONLINEAR_PATH = "data/{}/cv_nonlinear.pt".format(trial)
    ols_model = torch.load(LINEAR_PATH)
    nn_model = torch.load(NONLINEAR_PATH)
    models.append(ols_model)
    models.append(nn_model)
    infos.append(ModelInfo(trial, "OLS", None, "linear"))
    infos.append(ModelInfo(trial, "Neural Net", None, "nonlinear"))

    # Generate a null sample for each feature
    X_null_path = "data/{}/X_knockoffs.npy".format(trial)
    if os.path.exists(X_null_path):
        X_null = np.load(X_null_path)
    else:
        print("\tCreating knockoffs")
        X_null = np.zeros_like(X)
        for j in range(X.shape[1]):
            print("\tFeature {}".format(j))
            # Load the conditional model for this feature
            conditional = get_conditional(trial, j)

            # Draw a sample from it
            X_null[:, j], _ = conditional()

            # Drop the reference so the conditional model can be garbage
            # collected before the next feature's model is loaded
            conditional = None
        np.save(X_null_path, X_null)
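
    # Illustrative sanity check (not in the original code): every knockoff
    # column should pair one-to-one with its original feature
    assert X_null.shape == X.shape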

    # Run empirical risk knockoffs (ERK) for each model, skipping any whose
    # results already exist
    for info, model in zip(infos, models):
        if os.path.exists("data/{}/{}_selected.npy".format(trial, info.prefix)):
            print("\tERK results for {} exist. Skipping...".format(info.name))
            continue
        print("\tRunning ERK for {}".format(info.name))

        # Model-specific test statistic: the model's mean squared error (MSE)
        # on a given design matrix. Binding model as a default argument makes
        # the closure safe even if it outlives this loop iteration.
        def tstat(X_target, model=model):
            return ((y - model.predict(X_target)) ** 2).mean()

        # Run the knockoffs procedure
        selected, knockoff_stats = empirical_risk_knockoffs(
            X, tstat, fdr_threshold, X_null=X_null, verbose=False
        )

        np.save("data/{}/{}_selected.npy".format(trial, info.prefix), selected)
        np.save(
            "data/{}/{}_knockoff_stats.npy".format(trial, info.prefix), knockoff_stats
        )
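
Once the selected masks are saved, the empirical false discovery proportion and true positive rate can be checked against the known signal indicator, which is what the fdr_vals and tpr_vals arrays in Example #1 collect. A minimal sketch, assuming truth is a length-P 0/1 array as in Example #1 and selected is a boolean mask over features; fdp_and_tpr is a hypothetical helper, and the trial index and prefix in the usage lines are illustrative:

import numpy as np

def fdp_and_tpr(selected, truth):
    """Empirical false discovery proportion and true positive rate."""
    selected = np.asarray(selected, dtype=bool)
    truth = np.asarray(truth, dtype=bool)
    fdp = (selected & ~truth).sum() / max(selected.sum(), 1)  # guard 0/0
    tpr = (selected & truth).sum() / max(truth.sum(), 1)
    return fdp, tpr

# Illustrative usage with the files written by run(), for trial 0 and the
# "lasso" prefix used above
selected = np.load("data/0/lasso_selected.npy")
truth = np.loadtxt("data/0/truth.csv", delimiter=",")
print(fdp_and_tpr(selected, truth))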