Example #1
import numpy as np

def p_tpr_fdr(p_values, intervals, truth, fdr_threshold=0.1):
    # For every pair of intervals, run Benjamini-Hochberg on the corresponding
    # p-values and record the resulting TPR and FDR.
    interval_tpr = np.full((intervals.shape[0], intervals.shape[0]), np.nan)
    interval_fdr = np.full(interval_tpr.shape, np.nan)
    for idx1 in range(intervals.shape[0]):
        for idx2 in range(intervals.shape[0]):
            p = p_values[:, idx1, idx2]
            if np.any(np.isnan(p)):
                # Skip pairs whose p-values have not all been computed yet.
                continue
            p_indices = ~np.isnan(p)
            p = p[p_indices]
            t = truth[p_indices]
            d = bh_predictions(p, fdr_threshold)
            interval_tpr[idx1, idx2] = tpr(t, d)
            interval_fdr[idx1, idx2] = fdr(t, d)
    return interval_tpr, interval_fdr
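The helpers bh_predictions, tpr, and fdr are used throughout these examples but are not shown on this page. A minimal sketch of how they plausibly behave, assuming the standard Benjamini-Hochberg step-up procedure and the usual true-positive-rate / false-discovery-rate definitions (the project's own implementation may differ):

import numpy as np

def bh_predictions(p_values, fdr_threshold):
    # Benjamini-Hochberg step-up: return a 0/1 array marking discoveries.
    p = np.asarray(p_values)
    m = len(p)
    order = np.argsort(p)
    passed = p[order] <= fdr_threshold * np.arange(1, m + 1) / m
    discoveries = np.zeros(m, dtype=int)
    if passed.any():
        k = np.nonzero(passed)[0].max()      # largest rank passing the step-up rule
        discoveries[order[:k + 1]] = 1
    return discoveries

def tpr(truth, predictions):
    # Fraction of true signals that were discovered.
    truth = np.asarray(truth, dtype=bool)
    predictions = np.asarray(predictions, dtype=bool)
    return predictions[truth].mean() if truth.any() else 0.0

def fdr(truth, predictions):
    # Fraction of discoveries that are not true signals.
    truth = np.asarray(truth, dtype=bool)
    predictions = np.asarray(predictions, dtype=bool)
    return (~truth[predictions]).mean() if predictions.any() else 0.0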
Example #2
                importance = rf_importance(model.models)
            else:
                importance = linear_model_importance(model.models)

            # Get the p-values and add correction term
            all_p_filename = "data/{}/{}.npy".format(trial, info.prefix)
            p_values[info.name][trial] = np.load(all_p_filename)
            p_values[info.name][trial] = (p_values[info.name][trial] * nperms +
                                          1) / (nperms + 1)

            # Run the filtered testing procedure
            important = importance >= importance_threshold
            p_important = p_values[info.name][trial][important]
            print("\tFiltering down to {} features".format(important.sum()))
            pred = np.zeros(P, dtype=int)
            pred[important] = bh_predictions(p_important, fdr_threshold)
            tpr_vals[info.name][trial] = tpr(truth, pred)
            fdr_vals[info.name][trial] = fdr(truth, pred)

    labels = [info.name for info in infos]

    print("Plotting power and FDR results")
    results_plot(
        [tpr_vals[label] for label in labels],
        [fdr_vals[label] for label in labels],
        [label.replace(" ", "\n") for label in labels],
        fdr_threshold,
    )
    plt.savefig("plots/predictors-importance.pdf", bbox_inches="tight")
    plt.close()
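The helpers rf_importance and linear_model_importance are not defined in this snippet. A hypothetical sketch, assuming model.models holds the per-fold scikit-learn estimators, would simply average each estimator's built-in importance measure:

import numpy as np

def rf_importance(models):
    # Assumed sketch: average impurity-based importances over the
    # cross-validation-fold random forests (feature_importances_ in scikit-learn).
    return np.mean([m.feature_importances_ for m in models], axis=0)

def linear_model_importance(models):
    # Assumed sketch: average absolute coefficients over the fold linear models.
    return np.mean([np.abs(m.coef_).ravel() for m in models], axis=0)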
Example #3
def add_if_finished(trial, all_p_values, all_tpr, all_fdr, truth, fdr_threshold):
    # Score a trial only once all of its p-values have been computed.
    if not np.any(np.isnan(all_p_values[trial])):
        predictions = bh_predictions(all_p_values[trial], fdr_threshold)
        all_tpr[trial] = tpr(truth, predictions)
        all_fdr[trial] = fdr(truth, predictions)
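A small self-contained usage sketch (the trial count, feature count, and signal layout below are illustrative assumptions; bh_predictions, tpr, and fdr are the same helpers sketched under Example #1):

import numpy as np

ntrials, P, fdr_threshold = 3, 50, 0.1
truth = np.zeros(P, dtype=int)
truth[:5] = 1                               # pretend the first 5 features carry signal
all_p_values = np.random.uniform(size=(ntrials, P))
all_p_values[2, 10] = np.nan                # trial 2 is still waiting on one p-value
all_tpr = np.full(ntrials, np.nan)
all_fdr = np.full(ntrials, np.nan)

for trial in range(ntrials):
    add_if_finished(trial, all_p_values, all_tpr, all_fdr, truth, fdr_threshold)
# Trials 0 and 1 are scored; trial 2 stays NaN until its missing p-value arrives.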
Example #4
                    p_filename
                ):
                    p = np.load(p_filename)
                    p_values[info.name][trial, feature] = p
            np.save(all_p_filename, p_values[info.name][trial])
            p_values[info.name][trial] = (p_values[info.name][trial] * nperms + 1) / (
                nperms + 1
            )
            missing = np.isnan(p_values[info.name][trial])
            signal_p_values[info.name].extend(
                p_values[info.name][trial, :S][~missing[:S]]
            )
            p_trial = np.ones(P)
            p_trial[~missing] = p_values[info.name][trial][~missing]
            pred = bh_predictions(p_trial, fdr_threshold)
            tpr_vals[info.name][trial] = tpr(truth, pred)
            fdr_vals[info.name][trial] = fdr(truth, pred)

            if np.any(missing):
                if missing.sum() > 10:
                    print("Total missing: {}".format(missing.sum()))
                else:
                    print("Missing: {}".format(np.arange(P)[missing]))

        # Load the p-values for the other models
        for path, name in [
            ("cv_robust_nonlinear_p_values", "Neural Net"),
            ("cv_robust_linear_p_values", "OLS"),
        ]:
            p_trial = np.load("data/{}/{}.npy".format(trial, path))
            p_trial = (p_trial * nperms + 1) / (nperms + 1)

def main():
    N = 500 # total number of samples
    P = 500 # number of features
    S = 40 # number of signal features
    nperms = 5000
    nbootstraps = 100
    fdr_threshold = 0.1
    ntrials = 100
    names = ['Holdout\nPermutation', 'Calibrated\nHRT\n(linear)', 'Uncalibrated\nHRT',
             'CV\nPermutation', 'Calibrated\nCV-HRT\n(linear)', 'Uncalibrated\nCV-HRT',
             'Calibrated\nHRT\n(linear)', 'Calibrated\nHRT',
             'Calibrated\nCV-HRT\n(linear)', 'Calibrated\nCV-HRT']
    prefixes = ['perm', 'linear', 'nonlinear',
                'cv_perm', 'cv_linear', 'cv_nonlinear',
                'robust_linear', 'robust_nonlinear',
                'cv_robust_linear', 'cv_robust_nonlinear']
    p_values = np.full((len(prefixes), ntrials, P), np.nan)
    tpr_vals, fdr_vals = np.full((len(prefixes), ntrials), np.nan), np.full((len(prefixes), ntrials), np.nan)

    for idx, prefix in enumerate(prefixes):
        for trial in range(ntrials):
            if (trial % 25) == 0:
                print('{} trial: {}'.format(prefix, trial))
            TRUTH_PATH = 'data/{}/truth.csv'.format(trial)
            truth = np.loadtxt(TRUTH_PATH, delimiter=',')

            P_VALUE_PATH = 'data/{}/{}_p_values.npy'.format(trial, prefix)
            if os.path.exists(P_VALUE_PATH):
                p_values[idx, trial] = np.load(P_VALUE_PATH)
        
            clean_up_needed = False
            if np.any(np.isnan(p_values[idx, trial])):
                clean_up_needed = True
                for feature in range(P):
                    Pi_PATH = 'data/{}/{}_p_values_{}.npy'.format(trial, prefix, feature)
                    if np.isnan(p_values[idx, trial, feature]) and os.path.exists(Pi_PATH):
                        try:
                            p_values[idx, trial, feature] = np.load(Pi_PATH)
                        except Exception:
                            # Corrupt or partially written file; remove it so it can be regenerated.
                            os.remove(Pi_PATH)
            
            # p_values[idx, trial] = p_values[idx, trial] * nperms / (nperms+1)
            missing = np.isnan(p_values[idx, trial])
            pred = bh_predictions(p_values[idx, trial][~missing], fdr_threshold)
            tpr_vals[idx, trial] = tpr(truth[~missing], pred)
            fdr_vals[idx, trial] = fdr(truth[~missing], pred)

            if not np.any(np.isnan(p_values[idx,trial])):
                # clean up
                if clean_up_needed:
                    np.save(P_VALUE_PATH, p_values[idx,trial])
                    for feature in range(P):
                        Pi_PATH = 'data/{}/{}_p_values_{}.npy'.format(trial, prefix, feature)
                        if os.path.exists(Pi_PATH):
                            # print('Would delete {}'.format((idx, trial, feature)))
                            os.remove(Pi_PATH)
            else:
                print('Trial {} Nulls: {}'.format(trial, np.where(np.isnan(p_values[idx, trial]))[0]))

        if 'robust' in prefix:
            # Get the distribution of confidence intervals
            bounds = np.full((ntrials, P, 2), np.nan)
            for trial in range(ntrials):
                BOUNDS_PATH = 'data/{}/{}_bounds.npy'.format(trial, prefix)
                if os.path.exists(BOUNDS_PATH):
                    bounds[trial] = np.load(BOUNDS_PATH)
                
                clean_up_needed = False
                if np.any(np.isnan(bounds[trial])):
                    clean_up_needed = True
                    for feature in range(P):
                        BOUNDS_i_PATH = 'data/{}/{}_bounds_{}.npy'.format(trial, prefix, feature)
                        if np.any(np.isnan(bounds[trial, feature])) and os.path.exists(BOUNDS_i_PATH):
                            bounds[trial,feature] = np.load(BOUNDS_i_PATH)
                
                if not np.any(np.isnan(bounds[trial])):
                    # clean up
                    if clean_up_needed:
                        np.save(BOUNDS_PATH, bounds[trial])
                        for feature in range(P):
                            BOUNDS_i_PATH = 'data/{}/{}_bounds_{}.npy'.format(trial, prefix, feature)
                            if os.path.exists(BOUNDS_i_PATH):
                                # print('Would delete {}'.format(BOUNDS_i_PATH))
                                os.remove(BOUNDS_i_PATH)

            bounds_plot(bounds)
            plt.savefig('plots/liang-bounds-{}.pdf'.format(prefix.replace('_', '-')), bbox_inches='tight')


        print('*** {} model ({} trials) ***'.format(names[idx], (~np.isnan(tpr_vals[idx])).sum()))
        print('TPR: {:.2f}%'.format(np.nanmean(tpr_vals[idx], axis=0)*100))
        print('FDR: {:.2f}%'.format(np.nanmean(fdr_vals[idx], axis=0)*100))
        print('')
    
        p_plot(p_values[idx], S)
        plt.savefig('plots/liang-p-values-{}.pdf'.format(prefix.replace('_','-')), bbox_inches='tight')

    selected = np.array([0,3,2,5,7,9])
    results_plot([tpr_vals[i] for i in selected],
                 [fdr_vals[i] for i in selected],
                 [names[i] for i in selected],
                 fdr_threshold)
    plt.savefig('plots/liang-tpr-fdr.pdf', bbox_inches='tight')
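The correction (p * nperms + 1) / (nperms + 1) that recurs in these examples is the standard add-one adjustment for permutation p-values: with raw = b / nperms it yields (b + 1) / (nperms + 1), which can never be exactly zero. A quick check of the edge cases:

nperms = 5000
for raw in (0.0, 0.5, 1.0):
    corrected = (raw * nperms + 1) / (nperms + 1)
    print(raw, corrected)   # 0.0 -> ~0.0002, 0.5 -> ~0.5001, 1.0 -> 1.0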