def test_turf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): TuRF with ReliefF works in a sklearn pipeline when TuRF is parallelized"""
    np.random.seed(49082)
    clf = make_pipeline(TuRF(core_algorithm="ReliefF", n_features_to_select=2,
                             pct=0.5, n_neighbors=100, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels,
                                   fit_params={'turf__headers': headers},
                                   cv=3)) > 0.7
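# The pipeline tests in this file reference module-level ``features``,
# ``labels``, and ``headers`` fixtures that are not shown in this snippet.
# A minimal sketch of that setup, assuming a GAMETES-style epistasis dataset
# such as the one bundled with scikit-rebate; the file path below is an
# assumption, not confirmed by this snippet.
import numpy as np
import pandas as pd
from skrebate.turf import TuRF
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

genetic_data = pd.read_csv('data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
                           sep='\t', compression='gzip')  # assumed path
features = genetic_data.drop('class', axis=1).values
labels = genetic_data['class'].values
headers = list(genetic_data.drop('class', axis=1))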
def EPIMUTESTR(X=None, top_features=200, nn=None, discrete_threshold=10,
               verbose=False, n_cores=1, estimator='relief', pct=0.5):
    # Drop all-zero columns, then split the DataFrame into features and labels.
    X = X.loc[:, (X != 0).any(axis=0)]
    features, labels = X.drop('class', axis=1).values, X['class'].values
    features = np.nan_to_num(features)
    headers = list(X.drop("class", axis=1))

    # Default neighborhood size: ~15.4% of the number of features.
    if nn is None:
        nn = math.floor(0.154 * (X.shape[1] - 1))

    if estimator == 'TuRF':  # Tuned ReliefF (TuRF)
        fs = TuRF(core_algorithm="ReliefF", n_features_to_select=top_features,
                  n_neighbors=nn, pct=pct, verbose=verbose, n_jobs=n_cores)
        fs.fit(features, labels, headers)
    elif estimator == 'relief':  # ReliefF stand-alone
        fs = ReliefF(n_features_to_select=top_features, n_neighbors=nn,
                     discrete_threshold=discrete_threshold, verbose=verbose,
                     n_jobs=n_cores)
        fs.fit(features, labels)

    # Map each feature to its importance score, sorted by score (descending).
    scoreDict = dict(
        zip(X.drop('class', axis=1).columns, fs.feature_importances_))
    scoreDict_sorted = {
        i[1]: i[0]
        for i in sorted(zip(scoreDict.values(), scoreDict.keys()), reverse=True)
    }
    scores_list = list(scoreDict_sorted.values())
    pos_scores_list = [n for n in scores_list if n > 0]

    # Calculate the P value and adjusted P value for the positive scores.
    gene_scores = np.sqrt(pos_scores_list)
    gene_scores_mean = np.mean(gene_scores)
    gene_scores_sd = np.std(gene_scores)
    pvals = []
    for score in gene_scores:
        pvals.append(
            scipy.stats.norm(gene_scores_mean, gene_scores_sd).sf(score))

    # Benjamini/Hochberg FDR correction (the scores are sorted descending,
    # so the p-values are already in ascending order).
    qvals = fdr.multipletests(np.asarray(pvals), method='fdr_bh', is_sorted=True)

    # zip() truncates to the positively scored features, so only those genes
    # receive a (p-value, q-value) pair.
    geneList = dict(zip(scoreDict_sorted.keys(), zip(pvals, qvals[1])))
    return geneList
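# Usage sketch for EPIMUTESTR on a small synthetic gene matrix. Illustrative
# only: it assumes pandas is in scope as ``pd`` and that the function's own
# dependencies (numpy as ``np``, ``math``, ``scipy.stats``, skrebate's
# ReliefF/TuRF, and statsmodels' multipletests imported as ``fdr``) are
# already imported. Real inputs would be gene-level scores with a binary
# 'class' column.
rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.random((40, 12)),
                    columns=['gene{}'.format(i) for i in range(12)])
demo['class'] = rng.integers(0, 2, size=40)
results = EPIMUTESTR(X=demo, top_features=5, estimator='relief')
for gene, (pval, qval) in results.items():
    print(gene, pval, qval)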
def test_turfpercent_pipeline():
    """Ensure that TuRF with % neighbors works in a sklearn pipeline when it is parallelized"""
    np.random.seed(49082)
    clf = make_pipeline(TuRF(core_algorithm="MultiSURF", n_features_to_select=2,
                             step=0.4, n_neighbors=0.1, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels,
                                   fit_params={'turf__headers': headers},
                                   cv=3)) > 0.7
def test_turf_pipeline_mixed_attributes():
    """Ensure that TuRF works in a sklearn pipeline with mixed attributes"""
    np.random.seed(320931)
    clf = make_pipeline(TuRF(core_algorithm="MultiSURF", n_features_to_select=2,
                             step=0.4, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes,
                                   fit_params={'turf__headers': headers_mixed_attributes},
                                   cv=3)) > 0.7
def test_turf_init():
    """Check: TuRF constructor stores custom values correctly"""
    clf = TuRF(core_algorithm="MultiSURF",
               n_features_to_select=7,
               n_neighbors=500,
               pct=0.5,
               discrete_threshold=20,
               verbose=True,
               n_jobs=3)
    assert clf.core_algorithm == "MultiSURF"
    assert clf.n_features_to_select == 7
    assert clf.n_neighbors == 500
    assert clf.pct == 0.5
    assert clf.discrete_threshold == 20
    assert clf.verbose is True
    assert clf.n_jobs == 3
def test_turf_pipeline_cont_endpoint():
    """Ensure that TuRF works in a sklearn pipeline with continuous endpoint data"""
    np.random.seed(320931)
    clf = make_pipeline(TuRF(core_algorithm="MultiSURF", n_features_to_select=2,
                             step=0.4, n_jobs=-1),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))
    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint,
                                       fit_params={'turf__headers': headers_cont_endpoint},
                                       cv=3))) < 0.5
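# The mixed-attribute and continuous-endpoint tests above likewise assume
# module-level fixtures. A hedged sketch of that setup; the file names below
# are hypothetical placeholders for GAMETES-style datasets with mixed
# discrete/continuous features and with a continuous endpoint, respectively.
data_mixed = pd.read_csv('data/mixed_attribute_data.tsv.gz',
                         sep='\t', compression='gzip')  # hypothetical file
features_mixed_attributes = data_mixed.drop('class', axis=1).values
labels_mixed_attributes = data_mixed['class'].values
headers_mixed_attributes = list(data_mixed.drop('class', axis=1))

data_cont = pd.read_csv('data/continuous_endpoint_data.tsv.gz',
                        sep='\t', compression='gzip')  # hypothetical file
features_cont_endpoint = data_cont.drop('class', axis=1).values
labels_cont_endpoint = data_cont['class'].values
headers_cont_endpoint = list(data_cont.drop('class', axis=1))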
import numpy as np
import scipy.sparse
import tensorflow as tf  # only needed for the (commented-out) GCS loading path
from sklearn.ensemble import RandomForestClassifier
from skrebate.turf import TuRF

print("Loading files...")
# Code for loading files from a Google Cloud Storage bucket
# with tf.gfile.GFile("gs://keen-scion-203518-ml/blood_type_Rh_no_filter_no_augmentation_X.npz") as f:
#     sparse_X = scipy.sparse.load_npz(f)
#     X = sparse_X.toarray()
# with tf.gfile.GFile("gs://keen-scion-203518-ml/blood_type_Rh_no_filter_no_augmentation_y.npy") as f:
#     y = np.load(f)
sparse_X = scipy.sparse.load_npz("blood_type_Rh_no_filter_no_augmentation_X.npz")
X = sparse_X.toarray()[:, :1000]  # keep only the first 1000 columns
y = np.load("blood_type_Rh_no_filter_no_augmentation_y.npy")

print("Making pipeline...")
r = TuRF(core_algorithm="ReliefF", n_features_to_select=1000, pct=0.5, verbose=True)

print("Fitting dataset and labels...")
r.fit(X, y, range(X.shape[1]))

np.save("feature_importances.npy", r.feature_importances_)
np.save("top_features.npy", r.top_features_)
print("Complete!")
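# Downstream sketch (assumption: ``top_features_`` holds feature indices
# ranked by importance, as in skrebate): reduce X to the top-ranked columns
# and fit the classifier imported above.
top = np.load("top_features.npy")
X_reduced = X[:, top[:100]]  # keep the 100 highest-ranked features (illustrative)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_reduced, y)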