Example #1
0
def test_turf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): parallelized TuRF/ReliefF in a sklearn pipeline.

    Builds a TuRF (ReliefF core, ``n_jobs=-1``) -> random forest pipeline and
    requires the mean 3-fold cross-validation score to exceed 0.7.
    ``features``, ``labels`` and ``headers`` are fixtures defined outside
    this function.
    """
    np.random.seed(49082)

    selector = TuRF(core_algorithm="ReliefF",
                    n_features_to_select=2,
                    pct=0.5,
                    n_neighbors=100,
                    n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    # The feature headers are routed to the TuRF step's fit() via fit_params.
    cv_scores = cross_val_score(clf,
                                features,
                                labels,
                                fit_params={'turf__headers': headers},
                                cv=3)
    assert np.mean(cv_scores) > 0.7
Example #2
0
def EPIMUTESTR(X=None,
               top_features=200,
               nn=None,
               discrete_threshold=10,
               verbose=False,
               n_cores=1,
               estimator='relief',
               pct=0.5):
    """Score features with a Relief-based estimator and attach p-/q-values.

    Parameters
    ----------
    X : pandas.DataFrame
        Input table. Must contain a ``'class'`` column holding the labels;
        every other column is treated as a feature. Columns that are zero in
        every row are dropped and NaNs in the features are replaced by 0.
    top_features : int
        Passed to the estimator as ``n_features_to_select``.
    nn : int or None
        Passed to the estimator as ``n_neighbors``. When ``None`` it defaults
        to ``floor(0.154 * (number of retained columns - 1))``.
    discrete_threshold : int
        Passed to the estimator as ``discrete_threshold``.
    verbose : bool
        Passed to the estimator as ``verbose``.
    n_cores : int
        Passed to the estimator as ``n_jobs``.
    estimator : {'relief', 'TuRF'}
        ``'relief'`` runs ReliefF on its own; ``'TuRF'`` runs the TuRF
        wrapper with ReliefF as its core algorithm.
    pct : float
        Passed to TuRF as ``pct``; unused for ``'relief'``.

    Returns
    -------
    dict
        Maps each feature with a strictly positive importance score to a
        ``(p_value, q_value)`` tuple, ordered by descending score. The
        p-value is the upper-tail probability of the square-rooted score
        under a normal distribution whose mean and (population) standard
        deviation are taken from all square-rooted positive scores; the
        q-value is its Benjamini/Hochberg adjustment. Empty when no feature
        has a positive score.

    Raises
    ------
    ValueError
        If ``estimator`` is neither ``'TuRF'`` nor ``'relief'``.
    """
    # Fail fast with a clear message; previously an unknown estimator left
    # ``fs`` unbound and surfaced later as an UnboundLocalError.
    if estimator not in ('TuRF', 'relief'):
        raise ValueError(
            "estimator must be 'TuRF' or 'relief', got %r" % (estimator,))

    # Drop columns that are zero in every row.
    X = X.loc[:, (X != 0).any(axis=0)]
    feature_frame = X.drop('class', axis=1)
    features = np.nan_to_num(feature_frame.values)
    labels = X['class'].values
    headers = list(feature_frame)

    if nn is None:
        nn = math.floor(0.154 * (X.shape[1] - 1))

    if estimator == 'TuRF':
        # TuRF wrapper around ReliefF ("Tuned ReliefF" in skrebate; the name
        # is unrelated to "Total Unduplicated Reach and Frequency").
        # discrete_threshold is forwarded so it is honored on this path too,
        # matching the stand-alone ReliefF branch.
        fs = TuRF(core_algorithm="ReliefF",
                  n_features_to_select=top_features,
                  n_neighbors=nn,
                  pct=pct,
                  discrete_threshold=discrete_threshold,
                  verbose=verbose,
                  n_jobs=n_cores)
        fs.fit(features, labels, headers)
    else:
        # ReliefF stand alone
        fs = ReliefF(n_features_to_select=top_features,
                     n_neighbors=nn,
                     discrete_threshold=discrete_threshold,
                     verbose=verbose,
                     n_jobs=n_cores)
        fs.fit(features, labels)

    score_by_feature = dict(zip(headers, fs.feature_importances_))
    # Descending by score, ties broken by descending feature name.
    ranked = sorted(score_by_feature.items(),
                    key=lambda item: (item[1], item[0]),
                    reverse=True)

    # Only strictly positive scores take part in the statistics. Names and
    # scores are kept paired explicitly so every p-value stays attached to
    # the feature it was computed for.
    positive = [(name, score) for name, score in ranked if score > 0]
    if not positive:
        return {}
    names = [name for name, _ in positive]

    # calculate the P value and adjusted P value
    gene_scores = np.sqrt([score for _, score in positive])
    pvals = scipy.stats.norm.sf(gene_scores,
                                loc=np.mean(gene_scores),
                                scale=np.std(gene_scores))

    # Benjamini/Hochberg FDR correction. Scores are in descending order and
    # the survival function is decreasing, so pvals is already ascending,
    # which is what is_sorted=True relies on.
    qvals = fdr.multipletests(pvals, method='fdr_bh', is_sorted=True)

    # qvals[1] holds the adjusted p-values.
    return dict(zip(names, zip(pvals, qvals[1])))
Example #3
0
def test_turfpercent_pipeline():
    """Ensure that TuRF with % neighbors works in a parallelized sklearn pipeline.

    ``n_neighbors`` is given as the fraction 0.1. The mean 3-fold
    cross-validation score of the TuRF -> random forest pipeline must exceed
    0.7. ``features``, ``labels`` and ``headers`` are fixtures defined
    outside this function.
    """
    np.random.seed(49082)

    selector = TuRF(core_algorithm="MultiSURF",
                    n_features_to_select=2,
                    step=0.4,
                    n_neighbors=0.1,
                    n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    # The feature headers are routed to the TuRF step's fit() via fit_params.
    cv_scores = cross_val_score(clf,
                                features,
                                labels,
                                fit_params={'turf__headers': headers},
                                cv=3)
    assert np.mean(cv_scores) > 0.7
Example #4
0
def test_turf_pipeline_mixed_attributes():
    """Ensure that TuRF works in a sklearn pipeline with mixed attributes.

    Runs a TuRF (MultiSURF core) -> random forest pipeline on the
    ``*_mixed_attributes`` fixtures (defined outside this function) and
    requires the mean 3-fold cross-validation score to exceed 0.7.
    """
    np.random.seed(320931)

    selector = TuRF(core_algorithm="MultiSURF",
                    n_features_to_select=2,
                    step=0.4,
                    n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    # The feature headers are routed to the TuRF step's fit() via fit_params.
    turf_fit_params = {'turf__headers': headers_mixed_attributes}
    cv_scores = cross_val_score(clf,
                                features_mixed_attributes,
                                labels_mixed_attributes,
                                fit_params=turf_fit_params,
                                cv=3)
    assert np.mean(cv_scores) > 0.7
Example #5
0
def test_turf_init():
    """Check: TuRF constructor stores custom values correctly.

    Every keyword argument handed to the constructor must be readable back
    from the attribute of the same name, compared with ``==``.
    """
    custom_args = {
        "core_algorithm": "MultiSURF",
        "n_features_to_select": 7,
        "n_neighbors": 500,
        "pct": 0.5,
        "discrete_threshold": 20,
        "verbose": True,
        "n_jobs": 3,
    }

    clf = TuRF(**custom_args)

    # Dict order is insertion order, so attributes are checked in the same
    # sequence as they are listed above.
    for attr_name, expected in custom_args.items():
        assert getattr(clf, attr_name) == expected
Example #6
0
def test_turf_pipeline_cont_endpoint():
    """Ensure that TuRF works in a sklearn pipeline with continuous endpoint data.

    Runs a TuRF (MultiSURF core) -> random forest regressor pipeline on the
    ``*_cont_endpoint`` fixtures (defined outside this function) and requires
    the absolute value of the mean 3-fold cross-validation score to stay
    below 0.5.
    """
    np.random.seed(320931)

    selector = TuRF(core_algorithm="MultiSURF",
                    n_features_to_select=2,
                    step=0.4,
                    n_jobs=-1)
    regressor = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, regressor)

    # The feature headers are routed to the TuRF step's fit() via fit_params.
    turf_fit_params = {'turf__headers': headers_cont_endpoint}
    cv_scores = cross_val_score(clf,
                                features_cont_endpoint,
                                labels_cont_endpoint,
                                fit_params=turf_fit_params,
                                cv=3)
    assert abs(np.mean(cv_scores)) < 0.5
Example #7
0
# Script: run TuRF (ReliefF core) on the first 1000 columns of the
# blood-type Rh feature matrix and save the fitted estimator's
# feature_importances_ and top_features_ arrays as .npy files.
import numpy as np  # needed for np.load / np.save below (was missing)
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf  # only referenced by the commented-out GCS loader
from skrebate.turf import TuRF
import scipy.sparse

print("Loading files...")

# Code for loading files from a google cloud storage bucket

# with tf.gfile.GFile("gs://keen-scion-203518-ml/blood_type_Rh_no_filter_no_augmentation_X.npz") as f:
#     sparse_X = scipy.sparse.load_npz(f)
#     X = sparse_X.toarray()

# with tf.gfile.GFile("gs://keen-scion-203518-ml/blood_type_Rh_no_filter_no_augmentation_y.npy") as f:
#     y = np.load(f)

# Local files: the feature matrix is stored sparse; densify it and keep only
# the first 1000 columns.
sparse_X = scipy.sparse.load_npz("blood_type_Rh_no_filter_no_augmentation_X.npz")
X = sparse_X.toarray()[:,:1000]
y = np.load("blood_type_Rh_no_filter_no_augmentation_y.npy")

print("Making pipeline...")
r = TuRF(core_algorithm="ReliefF", n_features_to_select=1000, pct=0.5, verbose=True)

print("Fitting dataset and labels...")
# The column indices stand in for feature headers.
r.fit(X, y, range(X.shape[1]))

np.save("feature_importances.npy", r.feature_importances_)
np.save("top_features.npy", r.top_features_)

print("Complete!")