def test_select_fdr_classif():
    """
    Check that FDR-based univariate selection keeps the expected features
    on a simple classification problem, and that SelectFdr agrees with
    GenericUnivariateSelect run in "fdr" mode.
    """
    alpha = 0.0001
    X, Y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0)

    selector = SelectFdr(f_classif, alpha=alpha)
    selector.fit(X, Y)
    reduced = selector.transform(X)

    generic = GenericUnivariateSelect(f_classif, mode="fdr", param=alpha)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)
    assert_array_equal(reduced, reduced_generic)

    # The test expects exactly the first five features (3 informative +
    # 2 redundant were requested above) to be selected.
    selected_mask = selector.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(selected_mask, expected_mask)
def single_fdr(alpha, n_informative, random_state):
    """Return the observed false discovery rate of SelectFdr on one
    generated regression problem (0. when there are no false positives)."""
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)

    with warnings.catch_warnings(record=True):
        # Selecting no feature at all (low alpha or very noisy data) can
        # emit warnings; they are captured here rather than shown.
        selector = SelectFdr(f_regression, alpha=alpha)
        selected = selector.fit(X, y).transform(X)
        generic = GenericUnivariateSelect(f_regression, mode='fdr',
                                          param=alpha)
        selected_generic = generic.fit(X, y).transform(X)

    assert_array_equal(selected, selected_generic)

    mask = selector.get_support()
    # Features past the first n_informative are counted as false positives.
    false_pos = np.sum(mask[n_informative:] == 1)
    true_pos = np.sum(mask[:n_informative] == 1)

    if false_pos == 0:
        return 0.
    return false_pos / (true_pos + false_pos)
def single_fdr(alpha, n_informative, random_state):
    """Compute the false discovery rate obtained by SelectFdr on a single
    random regression data set.

    Returns 0.0 when no feature beyond the first ``n_informative`` ones is
    selected, otherwise false positives / all selected features.
    """
    regression_params = dict(
        n_samples=150,
        n_features=20,
        n_informative=n_informative,
        shuffle=False,
        random_state=random_state,
        noise=10,
    )
    X, y = make_regression(**regression_params)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        univariate_filter = SelectFdr(f_regression, alpha=alpha)
        univariate_filter.fit(X, y)
        X_r = univariate_filter.transform(X)

        generic_filter = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
        generic_filter.fit(X, y)
        X_r2 = generic_filter.transform(X)

    assert_array_equal(X_r, X_r2)

    chosen = univariate_filter.get_support()
    n_false = np.sum(chosen[n_informative:] == 1)
    n_true = np.sum(chosen[:n_informative] == 1)

    if n_false != 0:
        return n_false / (n_true + n_false)
    return 0.0
def test_boundary_case_ch2():
    # Boundary case: each selector below is parameterised so that exactly
    # one of the two features (the first one) is expected to be kept.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    chi2_scores, chi2_pvalues = chi2(X, y)
    assert_array_almost_equal(chi2_scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(chi2_pvalues,
                              np.array([0.04550026, 0.39802472]))

    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
class f_regressionFDRPrim(primitive):
    """Feature-selection primitive that wraps SelectFdr(f_regression)."""

    def __init__(self, random_state=0):
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with F-value between label/feature for regression tasks. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to can_accept_c for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when data['X'] has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a new SelectFdr(f_regression) on data['X'] / data['Y']."""
        prepared = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return {0: data} with 'X' reduced to the selected columns."""
        output = handle_data(data)
        column_names = list(output['X'].columns)
        support = self.selector.get_support(indices=False)
        # Keep the names of the columns whose support flag is set.
        kept_names = [name for name, flag in zip(column_names, support) if flag]
        reduced = self.selector.transform(output['X'])
        output['X'] = pd.DataFrame(reduced, columns=kept_names)
        return {0: output}
def test_select_fdr_classif():
    """
    Verify that SelectFdr with the f_classif score selects the expected
    features of a simple classification problem and that it matches
    GenericUnivariateSelect configured with mode='fdr'.
    """
    dataset_params = dict(n_samples=200, n_features=20, n_informative=3,
                          n_redundant=2, n_repeated=0, n_classes=8,
                          n_clusters_per_class=1, flip_y=0.0, class_sep=10,
                          shuffle=False, random_state=0)
    X, y = make_classification(**dataset_params)

    fdr_filter = SelectFdr(f_classif, alpha=0.0001)
    fdr_filter.fit(X, y)
    X_fdr = fdr_filter.transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode='fdr',
                                             param=0.0001)
    generic_filter.fit(X, y)
    X_generic = generic_filter.transform(X)

    assert_array_equal(X_fdr, X_generic)

    # Only the first five of the twenty features are expected to survive.
    support = fdr_filter.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(support, expected)
def gene_univariate_feature_selection(self, alpha=0.01):
    """Restrict the gene data to features passing an FDR filter.

    Builds the matched normal and tumor gene data sets via
    ``self.make_dataset``, fits ``SelectFdr(f_classif, alpha=alpha)`` on
    their concatenation, and then narrows ``self.gene_symbols``,
    ``self.gene_tumor`` and ``self.gene_normal`` to the selected genes
    (the three metadata columns are always retained).
    """
    common_kwargs = dict(dataset='gene', normal_matched=True,
                         mirna_gene_matched=True)
    normal_X, normal_Y = self.make_dataset(normal_tumor='normal',
                                           **common_kwargs)
    tumor_X, tumor_Y = self.make_dataset(normal_tumor='tumor',
                                         **common_kwargs)

    fdr_filter = SelectFdr(f_classif, alpha=alpha)
    # The transformed matrix is not needed; only the fitted support is used.
    fdr_filter.fit_transform(X=pandas.concat([normal_X, tumor_X]),
                             y=pandas.concat([normal_Y, tumor_Y]))

    selected_idx = fdr_filter.get_support(indices=True)
    self.gene_symbols = np.asanyarray(self.gene_symbols)[selected_idx].tolist()

    metadata_columns = ['patient_barcode', 'pathologic_stage',
                        'histological_type']
    self.gene_tumor = self.gene_tumor[self.gene_symbols + metadata_columns]
    self.gene_normal = self.gene_normal[self.gene_symbols + metadata_columns]
class UnivariateSelectChiFDRPrim(primitive):
    """Feature-selection primitive that wraps SelectFdr(chi2, alpha=0.05)."""

    def __init__(self, random_state=0):
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with Chi-square. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to can_accept_d for 'Classification'."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when data['X'] has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a new SelectFdr(chi2, alpha=0.05) on data['X'] / data['Y']."""
        prepared = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return {0: data} with 'X' reduced to the selected columns.

        If selecting or rebuilding the frame raises, the exception is
        printed and 'X' is returned as produced by handle_data.
        """
        output = handle_data(data)
        column_names = list(output['X'].columns)
        try:
            support = self.selector.get_support(indices=False)
            kept_names = [name for name, flag in zip(column_names, support) if flag]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_names)
        except Exception as e:
            print(e)
        return {0: output}
def test_boundary_case_ch2():
    # Test boundary case: with these settings every selector is expected to
    # retain the first feature only.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    def fitted_support(selector):
        # Fit the selector on the toy data and report its feature mask.
        selector.fit(X, y)
        return selector.get_support()

    assert_array_equal(fitted_support(SelectFdr(chi2, alpha=0.1)), np.array([True, False]))
    assert_array_equal(fitted_support(SelectKBest(chi2, k=1)), np.array([True, False]))
    assert_array_equal(fitted_support(SelectPercentile(chi2, percentile=50)), np.array([True, False]))
    assert_array_equal(fitted_support(SelectFpr(chi2, alpha=0.1)), np.array([True, False]))
    assert_array_equal(fitted_support(SelectFwe(chi2, alpha=0.1)), np.array([True, False]))
def SelectFdr_selector(data, target, sf):
    """Apply SelectFdr with score function ``sf`` and return the reduced data.

    The result is a new DataFrame holding the selected columns of ``data``
    (as transformed by the selector), labelled with their original names.
    """
    fdr = SelectFdr(score_func=sf)
    reduced = fdr.fit_transform(data.values, target.values.ravel())
    kept_positions = fdr.get_support(True)
    # Map the selected column positions back to their names.
    kept_names = [data.columns.values[pos] for pos in kept_positions]
    return pd.DataFrame(reduced, columns=kept_names)
def feature_select(labels, features, alfa=0.4):
    """Report RFECV and FDR-based feature selection results and plot them.

    Two RFECV searches (decision tree, 6-fold stratified CV) are run, one
    scored on recall and one on precision, and their optimal feature counts
    are printed. A ``SelectFdr(f_classif, alpha=alfa)`` filter is then fit
    and the selected feature names, scores and p-values are printed.
    Finally the RFECV grid scores are plotted.

    Parameters
    ----------
    labels : class labels passed to StratifiedKFold and the selectors.
    features : feature matrix passed to the selectors.
    alfa : alpha passed to SelectFdr (default 0.4).

    Notes
    -----
    Feature names are read from the module-level ``features_list``.
    Printing uses single-argument ``print(...)`` calls so the function
    parses on Python 3 while producing the same text as the former
    Python 2 print statements.
    """
    dct = DecisionTreeClassifier(random_state=42)
    rfecv1 = RFECV(estimator=dct, step=1,
                   cv=StratifiedKFold(labels, n_folds=6, shuffle=True,
                                      random_state=42),
                   scoring='recall')
    rfecv2 = RFECV(estimator=dct, step=1,
                   cv=StratifiedKFold(labels, n_folds=6, shuffle=True,
                                      random_state=42),
                   scoring='precision')
    rfecv1.fit(features, labels)
    rfecv2.fit(features, labels)
    print("Optimal number of features - Recall : %d" % rfecv1.n_features_)
    print("Optimal number of features - Precision : %d" % rfecv2.n_features_)

    # SelectKBest(score_func=f_classif, k=...) was the previously tried
    # alternative to the FDR filter used here.
    BestFeatures = SelectFdr(score_func=f_classif, alpha=alfa)
    BestFeatures.fit_transform(features, labels)
    feature_scores = BestFeatures.scores_
    feature_pvalues = BestFeatures.pvalues_
    best_feat_indices = BestFeatures.get_support(indices=True)

    # NOTE(review): the +1 offset presumably skips a leading label entry in
    # features_list -- confirm against where features_list is defined.
    best_list = [features_list[index + 1] for index in best_feat_indices]
    print("Best features: %s" % (best_list,))
    for name, index in zip(best_list, best_feat_indices):
        print("%s Score: %s P-value: %s"
              % (name, feature_scores[index], feature_pvalues[index]))

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Recall & Precision")
    plt.plot(range(1, len(rfecv1.grid_scores_) + 1), rfecv1.grid_scores_,
             label='Recall', color='blue')
    plt.plot(range(1, len(rfecv2.grid_scores_) + 1), rfecv2.grid_scores_,
             label='Precision', color='green')
    plt.legend()
    plt.show()
def select_fdr(input_data, feature_names=None, score_func=f_classif, alpha=0.05):
    """Select features with SelectFdr, relaxing alpha until enough remain.

    ``input_data`` is indexed as (x_train, y_train, x_test, y_test). When
    ``score_func`` is ``f_classif`` the data first goes through
    ``remove_constant``. 3-D inputs are flattened for selection and
    restored with ``make3D`` afterwards. The selector is refit with alpha
    increased by its initial value until more than one feature is kept.

    Returns ``(output_data, feature_names, final_args)`` where
    ``final_args`` records the score function and the alpha finally used.
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data, feature_names)

    x_train, y_train = input_data[0], input_data[1]
    x_test, y_test = input_data[2], input_data[3]

    is_3d = len(x_train.shape) == 3
    if is_3d:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    step = alpha
    while True:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        candidate_train = feature_selector.fit_transform(x_train, y_train)
        candidate_test = feature_selector.transform(x_test)
        if candidate_train.shape[1] > 1 and candidate_test.shape[1] > 1:
            x_train, x_test = candidate_train, candidate_test
            break
        # Too few features survived: loosen alpha by one step and retry.
        msg = 'Feature selection was too aggresive, '
        msg += 'increasing alpha from {} to {}'.format(alpha, alpha + step)
        alpha += step
        logging.warning(msg)

    if is_3d:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    if feature_names is not None:
        feature_names = feature_names[feature_selector.get_support()]

    logging.info('Selected {} features'.format(x_train.shape[1]))
    return (
        (x_train, y_train, x_test, y_test),
        feature_names,
        {'score_func': score_func, 'alpha': alpha},
    )
def select_fdr(df, target_col):
    """Keep the columns of ``df`` that pass a chi2 FDR filter (alpha=0.01).

    Parameters
    ----------
    df : pandas DataFrame containing the features and the target column.
    target_col : name of the target column in ``df``.

    Returns
    -------
    A new DataFrame with the selected feature columns plus ``target_col``,
    indexed like ``df``. If no feature is selected a message is printed
    and ``df`` itself is returned unchanged.
    """
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)

    support = selector.get_support()
    if not support.any():
        print(
            'No features were selected: either the data is too noisy or the selection test too strict.'
        )
        return df

    saved_columns = list(X.columns[support])
    # Reuse X's index: assigning the Series ``y`` below aligns on index
    # labels, so a fresh RangeIndex would misalign (or NaN-fill) the target
    # whenever ``df`` does not carry a default 0..n-1 index.
    result = pd.DataFrame(selector.transform(X), columns=saved_columns,
                          index=X.index)
    result[target_col] = y
    return result
def test_select_fdr_regression():
    """
    Check that FDR-based univariate selection keeps the expected features
    on a simple regression problem, and that SelectFdr agrees with
    GenericUnivariateSelect run in "fdr" mode.
    """
    alpha = 0.01
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    selector = SelectFdr(f_regression, alpha=alpha)
    selector.fit(X, Y)
    reduced = selector.transform(X)

    generic = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)
    assert_array_equal(reduced, reduced_generic)

    # Exactly the first five features are expected to be selected.
    selected_mask = selector.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(selected_mask, expected_mask)
def test_select_fdr_regression():
    """
    Verify that SelectFdr with the f_regression score selects the expected
    features of a simple regression problem and that it matches
    GenericUnivariateSelect configured with mode='fdr'.
    """
    dataset_params = dict(n_samples=200, n_features=20, n_informative=5,
                          shuffle=False, random_state=0)
    X, y = make_regression(**dataset_params)

    fdr_filter = SelectFdr(f_regression, alpha=0.01)
    fdr_filter.fit(X, y)
    X_fdr = fdr_filter.transform(X)

    generic_filter = GenericUnivariateSelect(f_regression, mode='fdr',
                                             param=0.01)
    generic_filter.fit(X, y)
    X_generic = generic_filter.transform(X)

    assert_array_equal(X_fdr, X_generic)

    # Only the first five of the twenty features are expected to survive.
    support = fdr_filter.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(support, expected)
def select_f(self, score_function, has_pvalue):
    """
    Score features with ``score_function`` on ``self.x`` / ``self.y``.

    SelectFdr is used when the score function provides p-values,
    otherwise SelectKBest with k='all'. Returns the tuple
    (function name, scores, p-values, keep-mask); the last two entries
    are None when ``has_pvalue`` is false.
    """
    fname = score_function.__name__
    if not has_pvalue:
        self._debug(f"Select K-Best: '{fname}'")
        select = SelectKBest(score_func=score_function, k='all')
    else:
        self._debug(f"Select FDR: '{fname}'")
        select = SelectFdr(score_func=score_function)

    self._debug(
        f"Select '{fname}': x.shape={self.x.shape}, y.shape={self.y.shape}"
    )
    select.fit(self.x, self.y)
    keep = select.get_support()

    # P-values and the support mask are only reported for the FDR variant.
    pvalues, mask = (select.pvalues_, keep) if has_pvalue else (None, None)
    return (fname, select.scores_, pvalues, mask)
def svm_cv(data, data_target):
    """Train and evaluate a linear SVM on FDR-selected features.

    The data is split with ``cross_validation.train_test_split``, a
    ``SelectFdr(f_classif)`` filter is fit on the training part, and a
    linear ``SVC`` with probability estimates is trained on the selected
    features. The selector's support mask, the accuracy and the
    classification report for the test part are printed.

    Parameters
    ----------
    data : feature matrix.
    data_target : labels matching ``data``.
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, data_target)
    print("*" * 79)
    print("Training...")
    # SelectFdr(chi2) was the previously tried alternative score function.
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)

    print("Testing...")
    X_test_selected = selector.transform(X_test)
    pred = clf.predict(X_test_selected)
    # Probabilities come from the classifier; the earlier code called
    # predict_proba on the prediction array and misspelled ``transform``.
    probs = clf.predict_proba(X_test_selected)

    accuracy = metrics.accuracy_score(y_test, pred)
    report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print(support)
    print(accuracy)
    print(report)

    # NOTE(review): the curve values are computed but not used or returned
    # by the code visible here -- confirm whether they should be.
    precision, recall, thresholds = precision_recall_curve(y_test,
                                                           probs[:, 1])
# Replace 'NaN' entries of X with the per-column mean (axis=0).
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X)
X = imputer.transform(X)

# feature scaling
# NOTE(review): X_norm is not referenced by any later statement visible
# here; the selectors below are fit on X -- confirm this is intended.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_norm = mms.fit_transform(X)

# Univariate feature selection using false discovery rate
from sklearn.feature_selection import SelectFdr, f_classif
X_fdr = SelectFdr(f_classif, alpha=0.05).fit(X, y)

# Get indices of selected features (the returned array is not stored)
X_fdr.get_support(indices=True)

# select features using false discovery rate method;
# X_fdr is rebound from the fitted selector to the reduced feature matrix
X_fdr = SelectFdr(f_classif, alpha=0.05).fit_transform(X, y)
print(X_fdr.shape)

# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_fdr, y, test_size=0.2, random_state=0)

# fitting logistic regression to Training Set
# (only the estimator is constructed in the statements visible here)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
# Load the embedding model and tokenize each comment on single spaces.
ft_model = load_embedding(FLAGS.embedfile)
docs = [c.split(' ') for c in comments_text]
# Keep only tokens that are present in the embedding vocabulary.
for i in range(len(docs)):
    docs[i] = [t for t in docs[i] if t in ft_model.vocab]
print('Building dictionary...')
comments_dictionary = Dictionary(docs)
comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]
print("Creating tfidf model...")
model_tfidf = TfidfModel(comments_corpus)
print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
# NOTE(review): the transpose presumably puts documents on rows so the
# matrix lines up with the label columns below -- confirm.
comments_vecs = corpus2csc(comments_tfidf).T
print('Finding important terms...')
# Every column of `data` from the third one onwards is treated as a label.
labelcols = data.columns.tolist()[2:]
terms = Counter()
for l in labelcols:
    cl = data[l]
    # One chi2-based FDR filter per label column.
    model_fdr = SelectFdr(chi2, alpha=0.025)
    model_fdr.fit(comments_vecs, cl)
    ids = model_fdr.get_support(indices=True)
    # Accumulate the chi2 score of each selected term across labels.
    for i in ids:
        terms[comments_dictionary[i]] += model_fdr.scores_[i]
print('Saving results...')
# Persist the accumulated term scores as a pickle.
with open(FLAGS.chi2file, 'wb') as f:
    pickle.dump(terms, f, protocol=pickle.HIGHEST_PROTOCOL)
# Map every distinct value of Y to a consecutive integer code, written
# into the pre-existing array y.
class_names=list(np.unique(Y))
class_num=0
number_of_classes=np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y==classes]=int(class_num)
    # String concatenation below requires the class values to be strings.
    print('Class '+ classes + ': ' + str(class_num))
    class_num=class_num+1
X = StandardScaler().fit_transform(X) # scaling used with the ANOVA score
#X = MinMaxScaler().fit_transform(X) # scaling used with the Chi2 score
## Select features
fdr = SelectFdr(f_classif,alpha=0.005) # ANOVA variant
#fdr = SelectFdr(chi2,alpha=0.05) # Chi2 variant
X_sel = fdr.fit_transform(X,y)
# Positions of the selected features, used to look up their names.
idx_sorted = fdr.get_support(indices = True)
fdr_select_features = list( feature_set[i] for i in idx_sorted)
print ('Selected features with FDR: ')
print (fdr_select_features)
print ('\n')
print (X.shape)
print (X_sel.shape)
# Rebuild the feature matrix from df using the selected column names and
# re-encode the 'Class' column with a LabelEncoder (y is overwritten).
X_new = df[fdr_select_features].values
Y=(df['Class'])
le = preprocessing.LabelEncoder()
y=le.fit_transform(Y)
def select_features(df,
                    target,
                    featsel_runs=3,
                    max_it=150,
                    w_thr=1e-4,
                    keep=None,
                    n_jobs=1,
                    verbose=0):
    """
    Select the columns of df that are useful for predicting target.

    Steps performed: standard-scale the data, apply a univariate
    SelectFdr(f_regression, alpha=0.1) filter, run _select_features_1run on
    random subsamples, drop candidates that are highly correlated with
    better-ranked ones, discard features whose Lasso weight does not exceed
    that of appended noise columns, and finally threshold the weights of a
    LassoLars model fit on the unscaled data.

    Inputs:
        - df: nxp pandas DataFrame with n data points and p features; to avoid overfitting, only provide data belonging
              to the n training data points.
        - target: n dimensional array with targets corresponding to the data points in df
                  (it must provide .reshape and support integer-array indexing)
        - featsel_runs: number of times to perform in the feature selection part with a random fraction of data points (int; default: 3)
        - max_it: how many iterations will be performed at most (int; default: 150)
        - w_thr: threshold on the final Lasso model weights to filter the features (float; default: 1e-4)
        - keep: list of features that should be kept no matter what
        - n_jobs: how many jobs to run when selecting the features in parallel (int; default: 1)
        - verbose: verbosity level (int; default: 0)
    Returns:
        - good_cols: list of column names for df with which a regression model can be trained
                     (the keep list itself is returned when nothing survives the noise filter)
    Raises:
        - ValueError: if df and target have different lengths
    """
    if not (len(df) == len(target)):
        raise ValueError("[featsel] df and target dimension mismatch.")
    if keep is None:
        keep = []
    # scale features to have 0 mean and unit std
    if verbose > 0:
        if featsel_runs > df.shape[0]:
            print("[featsel] WARNING: Less data points than featsel runs!!")
        print("[featsel] Scaling data...", end="")
    scaler = StandardScaler()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        df_scaled = pd.DataFrame(scaler.fit_transform(df),
                                 columns=df.columns,
                                 dtype=np.float32)
        # the same scaler object is refit on the target
        target_scaled = scaler.fit_transform(target.reshape(-1, 1)).ravel()
    if verbose > 0:
        print("done.")

    # quick and dirty univariate filtering
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fsel = SelectFdr(f_regression, alpha=0.1).fit(df_scaled, target_scaled)
    # keep-columns come first, followed by the selected columns not already kept
    cols = keep + [
        df_scaled.columns[i] for i in fsel.get_support(True)
        if df_scaled.columns[i] not in keep
    ]
    # if nothing at all was selected, all columns are carried forward
    if cols:
        df_scaled = df_scaled[cols]
    if verbose > 0:
        print("[featsel] %i/%i features after univariate filtering" %
              (len(df_scaled.columns), len(fsel.get_support())))

    # select good features in k runs in parallel
    # by doing sort of a cross-validation (i.e., randomly subsample data points)
    def run_select_features(i):
        if verbose > 0:
            print("[featsel] Feature selection run %i/%i" %
                  (i + 1, featsel_runs))
        # reseeds NumPy's global RNG with the run index
        np.random.seed(i)
        # subsample 80% of the rows, but at least 10
        rand_idx = np.random.permutation(
            df_scaled.index)[:max(10, int(0.8 * len(df_scaled)))]
        return _select_features_1run(df_scaled.iloc[rand_idx],
                                     target_scaled[rand_idx],
                                     max_it=max_it,
                                     eps=1e-8,
                                     verbose=verbose - 1)

    good_cols = [c for c in keep]
    if featsel_runs >= 1:
        if n_jobs == 1:
            # only use parallelization code if you actually parallelize
            selected_columns = []
            for i in range(featsel_runs):
                selected_columns.extend(run_select_features(i))
        else:

            def flatten_lists(l):
                return [item for sublist in l for item in sublist]

            selected_columns = flatten_lists(
                Parallel(n_jobs=n_jobs,
                         verbose=100 * verbose)(delayed(run_select_features)(i)
                                                for i in range(featsel_runs)))

        if len(selected_columns) > 1:
            # rank the columns by how often they were selected across runs
            selected_columns = Counter(selected_columns)
            selected_columns = sorted(selected_columns,
                                      key=selected_columns.get,
                                      reverse=True)
            selected_columns = keep + [
                c for c in selected_columns if c not in keep
            ]
            if verbose > 0:
                print("[featsel] %i features after %i feature selection runs" %
                      (len(selected_columns), featsel_runs))
            correlations = df_scaled[selected_columns].corr()
            # k is the number of leading columns accepted without a correlation check
            if not keep:
                good_cols.append(selected_columns[0])
                k = 1
            else:
                k = len(keep)
            for i, c in enumerate(selected_columns[k:], k):
                # only take features that are somewhat uncorrelated with the rest
                # (compared against the columns ranked before c)
                if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
                    good_cols.append(c)
            if verbose > 0:
                print("[featsel] %i features after correlation filtering" %
                      len(good_cols))
        else:
            good_cols += selected_columns
    # fall back to all original columns if nothing was selected so far
    if not good_cols:
        good_cols = list(df.columns)

    # perform recursive feature elimination on these features
    df_scaled = df_scaled[good_cols]
    X = df_scaled.to_numpy()
    # with more than 50 rows, append a shuffled copy of the data as noise features
    if df_scaled.shape[0] > 50:
        rand_noise = np.random.permutation(X.flatten()).reshape(X.shape)
        X = np.hstack([X, rand_noise])
    # always append at least 3 standard-normal noise columns
    rand_noise = np.random.randn(df.shape[0],
                                 max(3, int(0.5 * len(good_cols))))
    X = np.hstack([X, rand_noise])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reg = lm.LassoLarsCV(eps=1e-16)
        reg.fit(X, target)
    weights = dict(zip(good_cols, reg.coef_[:len(good_cols)]))
    # only include features that are more important than our known noise features
    noise_w_thr = np.max(np.abs(reg.coef_[len(good_cols):]))
    good_cols = [c for c in weights if abs(weights[c]) > noise_w_thr]
    if verbose > 0:
        print("[featsel] %i features after noise filtering" % len(good_cols))
    if not good_cols:
        if verbose > 0:
            print("[featsel] WARNING: Not a single good features was found...")
        return keep

    # train again a regression model, but this time on the original (unscaled) data
    df = df[good_cols]
    X = df.to_numpy()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reg = lm.LassoLarsCV(eps=1e-16)
        reg.fit(X, target)
        # alphas in CV are generally chosen a bit too small
        reg = lm.LassoLars(alpha=1.5 * reg.alpha_, eps=1e-16)
        reg.fit(X, target)
    weights = dict(zip(list(df.columns), reg.coef_))
    # order by absolute weight; keep those whose weight times the column's std reaches w_thr
    good_cols = [
        c for c in sorted(weights, key=lambda x: abs(weights[x]), reverse=True)
        if abs(weights[c] * df[c].std()) >= w_thr
    ]
    # add keep columns back in
    good_cols = keep + [c for c in good_cols if c not in keep]
    if verbose > 0:
        if not good_cols:
            print("[featsel] WARNING: Not a single good features was found...")
        print(
            "[featsel] %i final features selected (including %i original keep features)."
            % (len(good_cols), len(keep)))
    return good_cols
###### MASKING FOR SELECTED STIMS targetNames = ['bottle', 'face', 'scissors'] # the ttims of interest stimMask = targetData.labels.isin(targetNames) # indices for the stim of interest X_fMRI_selected = X_fMRI[stimMask] # features (for selected stimuli only) y = np.array(targetData.labelInd)[stimMask] # labels ###### FEATURE SELECTION # FDR feature selector selector = SelectFdr(f_classif, alpha=0.01) # FDR selector object selector.fit(X_fMRI_selected, y) # learning from the data X = selector.transform(X_fMRI_selected) # Selected features only indVoxels = selector.get_support(indices=True) # indices of surviving voxels ###### VISUALIZING FEATURE LOCATIONS # binary vector with 1s indicating selected voxels bROI = np.zeros(X_fMRI.shape[-1]) bROI[indVoxels] = 1 # reverse masking bROI_img = masker.inverse_transform(bROI) # Create the figure plot_stat_map(bROI_img, imgAnat, title='Voxels surviving FDR') ##### SVM CLASSIFICATION (LINEAR WITH C=1)
def remove_drugs_with_low_effect_univariate(
        feat, meta, threshold=0.05, fdr=0.05, test_each_dose=False,
        keep_names=['DMSO', 'NoCompound'], return_nonsignificant=False,
        drugname_column='drug_type', drugdose_column='drug_dose'
        ):
    """
    Remove drugs when the number of features significantly different to DMSO
    for any dose is lower than the threshold.
    The statistical significance of the difference between a compound dose
    and the DMSO is assessed based on individual ANOVA tests for each feature.
    The Benjamini-Hochberg method is used to control the false discovery rate.

    param:
        feat : dataframe
            feature dataframe
        meta : dataframe
            dataframe with metadata
        threshold : float < 1.0 and > 0.0
            fraction of significant features detected to consider that the
            compound has significant effect
        fdr : float < 1.0 and > 0.0
            false discovery rate parameter in Benjamini-Hochberg method
        test_each_dose : bool, optional
            If true, each dose of each drug is tested for statistical
            significance compared to DMSO, and the drug is considered to have
            a significant effect if any of the doses satisfies the conditions
            set by the fdr and threshold parameters.
            If False, an ANOVA test is performed comparing the DMSO with
            all the doses (as separate classes) and the conditions are
            checked once for each drug.
        keep_names : list or None, optional
            list of names from the drugname_column to keep without checking
            for significance. None means that no name is kept unconditionally.
            The list is never modified by this function.
        return_nonsignificant : bool, optional
            return the names of the drugs that are removed from the dataset
        drugname_column : string
            the name of the column in meta that contains the individual
            compound names
        drugdose_column : string
            the name of the column in meta that contains the drug doses

    return:
        feat = feature dataframe with low-potency drugs removed
        meta = metadata dataframe restricted to the same rows as feat
        nonsignificant = list of removed drug names
            (only when return_nonsignificant is True)

    raises:
        ValueError : propagated from the selector when a fit fails
    """
    import numpy as np
    from sklearn.feature_selection import SelectFdr, f_classif

    # Normalise keep_names so that the documented None value works in the
    # membership test below instead of raising TypeError.
    kept_names = [] if keep_names is None else list(keep_names)

    n_feat = feat.shape[1]
    min_significant = threshold * n_feat
    drug_names = meta[drugname_column].unique()

    significant_drugs = []
    for drug in drug_names:
        if drug in kept_names:
            continue

        # For each dose get significant features using Benjamini-Hochberg
        # method with FDR=fdr
        rows = meta[drugname_column].isin([drug, 'DMSO'])
        X = feat[rows]
        y = meta.loc[rows, drugdose_column]

        selector = SelectFdr(score_func=f_classif, alpha=fdr)

        if not test_each_dose:
            # A failing fit raises ValueError to the caller; the former code
            # dropped into an interactive pdb session at this point.
            selector.fit(X, y)
            if np.sum(selector.get_support()) > min_significant:
                significant_drugs.append(drug)
        else:
            n_sign_feat = []
            for dose in y.unique():
                # Dose 0 is the reference each dose is compared against.
                if dose == 0:
                    continue
                dose_rows = np.isin(y, [0, dose])
                selector.fit(X[dose_rows], y[dose_rows])
                n_sign_feat.append(np.sum(selector.get_support()))

            if np.any([n > min_significant for n in n_sign_feat]):
                significant_drugs.append(drug)

    # If DMSO was in drug list, include it in the final dataframe
    # (default behaviour)
    significant_drugs.extend(kept_names)

    selected_rows = meta[drugname_column].isin(significant_drugs)
    feat = feat[selected_rows]
    meta = meta[selected_rows]

    if return_nonsignificant:
        return feat, meta, list(
            set(drug_names).difference(set(significant_drugs)))
    else:
        return feat, meta