Example #1
import argparse
from multiprocessing import Pool

import numpy as np
import torch

# load_plx4720, get_model_weights, seed_fn, and run_parallel are defined
# elsewhere in this case-study module.

def main():
    parser = argparse.ArgumentParser(
        description="Worker script for the case study.")

    # Experiment settings
    parser.add_argument("min_feature", type=int, help="Min Feature ID")
    parser.add_argument(
        "max_feature",
        type=int,
        help="Max Feature ID. Set to -1 to go to the max feature ID.",
    )
    parser.add_argument("--nthreads",
                        type=int,
                        default=4,
                        help="Number of parallel workers to run.")
    parser.add_argument("--fdr_threshold",
                        type=float,
                        default=0.2,
                        help="Target false discovery rate.")

    # Get the arguments from the command line
    args = parser.parse_args()
    dargs = vars(args)

    torch.set_num_threads(1)  # bad torch, no biscuit: stay single-threaded, the worker pool supplies the parallelism

    # Load the data and the model
    print("Loading data")
    X_drug, y_drug, features, ccle_features, elastic_model = load_plx4720()

    if args.max_feature == -1:
        args.max_feature = len(ccle_features)

    # Get the weights to use in feature ranking
    model_weights = get_model_weights(elastic_model)

    # Build all the jobs
    jobs = [(
        feat_idx,
        X_drug,
        y_drug,
        elastic_model,
        features,
        ccle_features,
        model_weights,
    ) for feat_idx in range(args.min_feature,
                            min(args.max_feature + 1, len(ccle_features)))]

    print("Running {} jobs".format(len(jobs)))

    # Run in parallel (okay this is processes not threads, but who's counting?)
    with Pool(args.nthreads, initializer=seed_fn) as pool:
        p_values = np.array(pool.map(run_parallel, jobs))
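
seed_fn and run_parallel are defined elsewhere in the case-study module. As
a point of reference, here is a minimal, self-contained sketch of the same
Pool-with-initializer pattern; both helpers below are hypothetical stand-ins,
not pyhrt's actual implementations.

import numpy as np
from multiprocessing import Pool


def seed_fn():
    # Reseed each worker process from OS entropy so the randomization
    # tests running in different processes do not share an RNG stream.
    np.random.seed()


def run_parallel(job):
    # Unpack one job tuple and return one p-value (a random placeholder
    # here; the real worker would run the holdout randomization test).
    feat_idx, payload = job
    return float(np.random.uniform())


if __name__ == '__main__':
    jobs = [(i, None) for i in range(8)]
    with Pool(4, initializer=seed_fn) as pool:
        p_values = np.array(pool.map(run_parallel, jobs))
    print(p_values)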
Example #2
import argparse
from multiprocessing import Pool

import numpy as np
import torch

from pyhrt.utils import bh

# load_olfaction, load_or_fit_model, get_model_weights, seed_fn, and
# run_parallel are defined elsewhere in this case-study module.

def main():
    parser = argparse.ArgumentParser(
        description='Worker script for the case study.')

    # Experiment settings
    descriptors = [
        'Bakery', 'Sour', 'Intensity', 'Sweet', 'Burnt', 'Pleasantness',
        'Fish', 'Fruit', 'Garlic', 'Spices', 'Cold', 'Acid', 'Warm', 'Musky',
        'Sweaty', 'Ammonia', 'Decayed', 'Wood', 'Grass', 'Flower', 'Chemical'
    ]
    parser.add_argument('min_feature', type=int, help='Min Feature ID')
    parser.add_argument('max_feature', type=int, help='Max Feature ID')
    parser.add_argument('--descriptor',
                        choices=descriptors,
                        default='Bakery',
                        help='The descriptor type to get p-values for.')
    parser.add_argument('--nthreads',
                        type=int,
                        default=4,
                        help='Number of parallel workers to run.')
    parser.add_argument('--fdr_threshold',
                        type=float,
                        default=0.2,
                        help='Target false discovery rate.')
    parser.add_argument(
        '--importance_threshold',
        type=float,
        default=1e-3,
        help='Minimum heuristic feature importance to make a feature test-worthy.')

    # Get the arguments from the command line
    args = parser.parse_args()
    dargs = vars(args)

    torch.set_num_threads(1)  # bad torch, no biscuit: stay single-threaded, the worker pool supplies the parallelism

    # Load the data and the model
    print('Loading data')
    X, Y, descriptors, target_features = load_olfaction()  # rebinds the choices list, which is no longer needed
    features = X.columns

    print('Loading model')
    # Get the model and data specifically for this descriptor class
    x, y, forest_model = load_or_fit_model(args.descriptor, X, Y)

    # Handle an idiosyncrasy of multiprocessing with sklearn random forests
    for m in forest_model.models:
        m.n_jobs = 1

    # Get the weights to use in feature ranking
    model_weights = get_model_weights(forest_model)

    print('Total: {}'.format(len(X.columns)))

    # Build all the jobs
    jobs = [
        (target_feature, x, y, features, forest_model, model_weights,
         args.descriptor)
        for target_feature, importance in zip(
            X.columns[args.min_feature:args.max_feature + 1],
            model_weights[args.min_feature:args.max_feature + 1])
        if importance >= args.importance_threshold
    ]

    print('Running {} jobs'.format(len(jobs)))

    # Run in parallel (okay this is processes not threads, but who's counting?)
    with Pool(args.nthreads, initializer=seed_fn) as pool:
        p_values = np.array(pool.map(run_parallel, jobs))

    # Multiple testing correction via Benjamini-Hochberg at the target FDR (20% by default)
    discoveries = bh(p_values, args.fdr_threshold)
    discovery_genes = features[discoveries]
    discovery_p = p_values[discoveries]
    discovery_weights = model_weights[discoveries]

    # Print the discoveries along with their model weights and p-values
    order = np.argsort(discovery_weights)[::-1]
    print('')
    print('Molecular Feature & Model Weight & $p$-value \\\\')
    for g, w, p in zip(discovery_genes[order], discovery_weights[order],
                       discovery_p[order]):
        print('{} & {} & {} \\\\'.format(g, w, p))
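
bh here is pyhrt.utils.bh, a Benjamini-Hochberg step-up correction. For
reference, a minimal sketch of the standard BH procedure (not necessarily
pyhrt's exact implementation) that returns the indices of the rejected
hypotheses:

import numpy as np


def bh_sketch(p_values, fdr):
    # Benjamini-Hochberg step-up: with 1-based ranks, reject the k
    # smallest p-values, where k is the largest rank such that
    # p_(k) <= (k / m) * fdr.
    p = np.asarray(p_values)
    m = len(p)
    order = np.argsort(p)
    below = p[order] <= fdr * np.arange(1, m + 1) / m
    if not below.any():
        return np.array([], dtype=int)
    k = np.where(below)[0].max() + 1  # number of rejections
    return order[:k]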
Example #3
import argparse
from multiprocessing import Pool

import numpy as np
import torch

from pyhrt.utils import bh

# load_olfaction, load_or_fit_model, get_model_weights, seed_fn, and
# run_parallel are defined elsewhere in this case-study module.

def main():
    parser = argparse.ArgumentParser(
        description="Worker script for the case study.")

    # Experiment settings
    descriptors = [
        "Bakery",
        "Sour",
        "Intensity",
        "Sweet",
        "Burnt",
        "Pleasantness",
        "Fish",
        "Fruit",
        "Garlic",
        "Spices",
        "Cold",
        "Acid",
        "Warm",
        "Musky",
        "Sweaty",
        "Ammonia",
        "Decayed",
        "Wood",
        "Grass",
        "Flower",
        "Chemical",
    ]
    parser.add_argument("min_feature", type=int, help="Min Feature ID")
    parser.add_argument("max_feature", type=int, help="Max Feature ID")
    parser.add_argument(
        "--descriptor",
        choices=descriptors,
        default="Bakery",
        help="The descriptor type to get p-values for.",
    )
    parser.add_argument("--nthreads",
                        type=int,
                        default=4,
                        help="Number of parallel workers to run.")
    parser.add_argument("--fdr_threshold",
                        type=float,
                        default=0.2,
                        help="Target false discovery rate.")
    parser.add_argument(
        "--importance_threshold",
        type=float,
        default=1e-3,
        help="Minimum heuristic feature importance to make a feature test-worthy.",
    )

    # Get the arguments from the command line
    args = parser.parse_args()
    dargs = vars(args)

    torch.set_num_threads(1)  # bad torch, no biscuit: stay single-threaded, the worker pool supplies the parallelism

    # Load the data and the model
    print("Loading data")
    X, Y, descriptors, target_features = load_olfaction()  # rebinds the choices list, which is no longer needed
    features = X.columns

    print("Loading model")
    # Get the model and data specifically for this descriptor class
    x, y, forest_model = load_or_fit_model(args.descriptor, X, Y)

    # Handle an idiosyncrasy of multiprocessing with sklearn random forests
    for m in forest_model.models:
        m.n_jobs = 1

    # Get the weights to use in feature ranking
    model_weights = get_model_weights(forest_model)

    print("Total: {}".format(len(X.columns)))

    # Build all the jobs
    jobs = [(target_feature, x, y, features, forest_model, model_weights,
             args.descriptor) for target_feature, importance in zip(
                 X.columns[args.min_feature:args.max_feature + 1],
                 model_weights[args.min_feature:args.max_feature + 1],
             ) if importance >= args.importance_threshold]

    print("Running {} jobs".format(len(jobs)))

    # Run in parallel (okay this is processes not threads, but who's counting?)
    with Pool(args.nthreads, initializer=seed_fn) as pool:
        p_values = np.array(pool.map(run_parallel, jobs))

    # Multiple testing correction via Benjamini-Hochberg at the target FDR (20% by default)
    discoveries = bh(p_values, args.fdr_threshold)
    discovery_genes = features[discoveries]
    discovery_p = p_values[discoveries]
    discovery_weights = model_weights[discoveries]

    # Print the discoveries along with their model weights and p-values
    order = np.argsort(discovery_weights)[::-1]
    print("")
    print("Molecular Feature & Model Weight & $p$-value \\\\")
    for g, w, p in zip(discovery_genes[order], discovery_weights[order],
                       discovery_p[order]):
        print("{} & {} & {} \\\\".format(g, w, p))
Example #4
import os
import argparse

import numpy as np
import torch

from main import load_plx4720, run_hrt, CvModel, get_model_weights
from pyhrt.utils import bh

if __name__ == '__main__':
    fdr_threshold = 0.2
    importance_threshold = 1e-3

    # Load the data and the model
    print('Loading data')
    X_drug, y_drug, features, ccle_features, elastic_model = load_plx4720()

    # Get the weights to use in feature ranking
    model_weights = get_model_weights(elastic_model)

    all_p_path = 'data/p.npy'
    if os.path.exists(all_p_path):
        p_values = np.load(all_p_path)
    else:
        p_values = np.full(len(ccle_features), np.nan)
    for feat_idx in range(len(ccle_features)):
        if not np.isnan(p_values[feat_idx]):
            continue

        p_path = 'data/p_{}.npy'.format(feat_idx)

        # Check if we have already computed this p-value
        if os.path.exists(p_path):
            p_values[feat_idx] = np.load(p_path)
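
This aggregator assumes each worker run has saved its p-value to
data/p_<feat_idx>.npy. A hypothetical sketch of the worker-side save step
that would produce those files:

import numpy as np


def save_result(feat_idx, p_value):
    # Persist one feature's p-value so the aggregation loop above can
    # pick it up on its next run (path pattern matches the loader).
    np.save('data/p_{}.npy'.format(feat_idx), p_value)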
Example #5
import os
import argparse

import numpy as np
import torch

from pyhrt.utils import bh

# load_olfaction, load_or_fit_model, and get_model_weights are defined
# elsewhere in this case-study module.

def main():
    parser = argparse.ArgumentParser(
        description='Worker script for the case study.')

    # Experiment settings
    descriptors = [
        'Bakery', 'Sour', 'Intensity', 'Sweet', 'Burnt', 'Pleasantness',
        'Fish', 'Fruit', 'Garlic', 'Spices', 'Cold', 'Acid', 'Warm', 'Musky',
        'Sweaty', 'Ammonia', 'Decayed', 'Wood', 'Grass', 'Flower', 'Chemical'
    ]
    parser.add_argument('--descriptor',
                        choices=descriptors,
                        default='Bakery',
                        help='The descriptor type to get p-values for.')
    parser.add_argument('--fdr_threshold',
                        type=float,
                        default=0.2,
                        help='Target false discovery rate.')
    parser.add_argument(
        '--importance_threshold',
        type=float,
        default=1e-3,
        help='Minimum heuristic feature importance to make a feature test-worthy.')

    # Get the arguments from the command line
    args = parser.parse_args()
    dargs = vars(args)

    torch.set_num_threads(1)  # bad torch, no biscuit: stay single-threaded, the worker pool supplies the parallelism

    # Load the data and the model
    print('Loading data')
    X, Y, descriptors, target_features = load_olfaction()  # rebinds the choices list, which is no longer needed
    features = X.columns

    print('Loading model')
    # Get the model and data specifically for this descriptor class
    x, y, forest_model = load_or_fit_model(args.descriptor, X, Y)

    # Handle an idiosyncrasy of multiprocessing with sklearn random forests
    for m in forest_model.models:
        m.n_jobs = 1

    # Get the weights to use in feature ranking
    model_weights = get_model_weights(forest_model)

    print('Total: {}'.format(len(X.columns)))

    all_p_path = 'data/p_{}.npy'.format(args.descriptor)
    if os.path.exists(all_p_path):
        p_values = np.load(all_p_path)
    else:
        p_values = np.full(len(features), np.nan)
    for feat_idx in range(len(features)):
        if not np.isnan(p_values[feat_idx]):
            continue

        p_path = 'data/p_{}_{}.npy'.format(args.descriptor, feat_idx)

        # Check if we have already computed this p-value
        if os.path.exists(p_path):
            p_values[feat_idx] = np.load(p_path)
            # print('p-value for {}: {}'.format(features[feat_idx], p_values[feat_idx]))
            continue

        # Skip features whose heuristic weight is below the importance threshold; the model effectively ignores them
        if np.abs(model_weights[feat_idx]) < args.importance_threshold:
            # print('p-value for {}: 1 (0 weight in model)'.format(features[feat_idx]))
            p_values[feat_idx] = 1
            continue

        print('************ Missing p-value for {} ************'.format(
            feat_idx))

    # Save the aggregated results
    np.save(all_p_path, p_values)

    # Print the top-ranked features by their heuristic weight
    for rank, (target_feature,
               importance) in enumerate(target_features[args.descriptor]):
        p_value = p_values[features.get_loc(target_feature)]
        print('{} & {:.4f} & {:.4f} \\\\'.format(
            target_feature.replace('\'', ''), importance, p_value))

    if np.any(np.isnan(p_values)):
        print('{} NaN p-values!'.format(np.isnan(p_values).sum()))
        missing = np.where(np.isnan(p_values))[0]
        print(missing)
        print([features[idx] for idx in missing])
        print('Setting to 1')
        p_values[np.isnan(p_values)] = 1

    # Only consider features with substantial heuristic feature importance
    important = model_weights >= args.importance_threshold
    p_important = p_values[important]

    print('{} features above importance threshold'.format(important.sum()))

    # Multiple testing correction via Benjamini-Hochberg at the target FDR (20% by default)
    # discoveries = bh(p_values, args.fdr_threshold) # Test all features
    discoveries = np.arange(len(features))[important][bh(
        p_important, args.fdr_threshold)]  # Test important features
    discovery_features = features[discoveries]
    discovery_p = p_values[discoveries]
    discovery_weights = model_weights[discoveries]

    # Print the discoveries along with their model weights and p-values
    order = np.argsort(np.abs(discovery_weights))[::-1]
    print('')
    print('Molecular Feature & Model Weight & $p$-value \\\\')
    for f, w, p in zip(discovery_features[order], discovery_weights[order],
                       discovery_p[order]):
        print('{} & {:.4f} & {:.4f} \\\\'.format(f, w, p))
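
The discoveries computation above runs BH only on the features that pass
the importance threshold, then maps the subset-relative indices returned
by bh back to positions in the full feature array. A toy illustration of
that index arithmetic (all values made up):

import numpy as np

p_values = np.array([0.001, 0.90, 0.02, 0.50, 0.004])
model_weights = np.array([0.5, 0.0, 0.3, 0.0, 0.2])
important = model_weights >= 1e-3                       # mask over all features
subset_positions = np.arange(len(p_values))[important]  # -> [0, 2, 4]
# bh returns indices within the important subset; indexing
# subset_positions with them recovers full-array feature indices.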