def fs_continuous(X, y, method):
    """Select features for a continuous (regression) target.

    All four methods are implemented and each is run exactly as requested;
    nothing in this function replaces ``'Boruta'`` or ``'JMI'`` with ``'L1'``
    (any such override would have to happen in the caller).

    Parameters
    ----------
    X : array-like
        Feature matrix, handed unchanged to the selector's ``fit``.
    y : array-like
        Continuous target, handed unchanged to the selector's ``fit``.
    method : str
        One of ``'Boruta'``, ``'JMI'``, ``'L1'`` or ``'FDR'`` (case-sensitive).

    Returns
    -------
    numpy.ndarray
        Column indices of the features kept by the chosen selector.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names.
    """
    if method == 'Boruta':
        forest = RandomForestRegressor(n_jobs=-1)
        selector = boruta.BorutaPy(forest, n_estimators='auto')
        selector.fit(X, y)
        return np.where(selector.support_)[0]

    if method == 'JMI':
        selector = mifs.MutualInformationFeatureSelector(method='JMI',
                                                         categorical=False)
        selector.fit(X, y)
        return np.where(selector.support_)[0]

    if method == 'L1':
        # `normalize=False` was the default and the parameter no longer
        # exists in current scikit-learn, so it is simply not passed.
        selector = SelectFromModel(LassoCV(n_jobs=-1))
        selector.fit(X, y)
        return selector.get_support(indices=True)

    if method == 'FDR':
        # `alpha` must be given by keyword in current scikit-learn.
        selector = fs.SelectFdr(fs.f_regression, alpha=.05)
        selector.fit(X, y)
        return selector.get_support(indices=True)

    raise ValueError(
        "Unknown feature selection method %r; expected one of "
        "'Boruta', 'JMI', 'L1', 'FDR'." % (method,)
    )
def get_fs_model(model, method, train, target=None, cv=None):
    """Combine ``model`` with the feature selection step named by ``method``.

    Two kinds of result are produced:

    * ``"RFE"`` / ``"RFECV"``: ``model`` is wrapped in the recursive
      eliminator, which is fitted on ``train`` (and ``target`` when given)
      and returned.
    * every other supported method: a two-step ``Pipeline``
      (``'feature_selection'`` then ``'data_mining'``) is returned.  The
      pipeline itself is NOT fitted here; only for ``"fromModel"`` is the
      selector step fitted before it is placed in the pipeline.

    An ``"Anova"`` option (SelectKBest with f_regression, k=5) existed only as
    commented-out code and is not supported.

    Parameters
    ----------
    model : estimator
        The estimator to wrap or to use as the final pipeline step.
    method : str
        One of ``"RFE"``, ``"RFECV"``, ``"linearSVC"``, ``"fromModel"``,
        ``"VarianceThreshold"``, ``"SelectPercentile"``, ``"SelectFpr"``,
        ``"SelectFdr"``, ``"SelectFwe"`` or ``"ch2"``.
    train : array-like
        Training data, used only by the branches that fit something.
    target : array-like, optional
        Training labels; when ``None`` the fit calls receive ``train`` only.
    cv : optional
        Passed through as ``cv`` to ``RFECV``; unused by other methods.

    Returns
    -------
    A fitted RFE/RFECV object, or an unfitted ``Pipeline``.

    Notes
    -----
    For an unknown ``method`` a message is printed and the process exits via
    ``sys.exit(1)``.
    """
    # Recursive elimination wraps the model directly and is trained here.
    if method in ("RFE", "RFECV"):
        if method == "RFE":
            # Keyword form of the former positional `2`.
            model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        else:
            # The former positional `3` occupied RFECV's `step` slot.
            model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        return model.fit(train)

    # All remaining methods only differ in the selector placed in front of
    # the model inside the pipeline.
    if method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
    elif method == "fromModel":
        sel = fs_scikit.SelectFromModel(model)
        if target is not None:
            sel.fit(train, target)
        else:
            sel.fit(train)
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)

    return Pipeline([('feature_selection', sel), ('data_mining', model)])
def do_fs(X, y, method):
    """Run one feature selection method for a classification target.

    Parameters
    ----------
    X : array-like
        Feature matrix, handed unchanged to the selector.
    y : array-like
        Class labels, handed unchanged to the selector.
    method : str
        One of ``"fdr"``, ``"l1svc"``, ``"boruta"`` or ``"jmi"``
        (case-sensitive).

    Returns
    -------
    For ``"fdr"``, ``"boruta"`` and ``"jmi"`` a numpy array with the column
    indices of the selected features; for ``"l1svc"`` whatever
    ``lsvc_cv.recursive_lsvc_cv`` returns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "fdr":
        # `alpha` must be given by keyword in current scikit-learn.
        selector = fs.SelectFdr(fs.f_classif, alpha=.05).fit(X, y)
        return selector.get_support(indices=True)

    if method == "l1svc":
        return lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)

    if method == "boruta":
        forest = RandomForestClassifier(n_jobs=-1)
        selector = boruta.BorutaPy(forest, n_estimators='auto')
        selector.fit(X, y)
        return np.where(selector.support_)[0]

    if method == "jmi":
        selector = mifs.MutualInformationFeatureSelector(method='JMI')
        selector.fit(X, y)
        return np.where(selector.support_)[0]

    raise ValueError(
        "Unknown feature selection method %r; expected one of "
        "'fdr', 'l1svc', 'boruta', 'jmi'." % (method,)
    )
def fs_categorical(X, y, method):
    """Select features for a categorical (classification) target.

    Parameters
    ----------
    X : array-like
        Feature matrix, handed unchanged to the selector.
    y : array-like
        Class labels, handed unchanged to the selector.
    method : str
        One of ``'Boruta'``, ``'JMI'``, ``'L1'`` or ``'FDR'`` (case-sensitive).

    Returns
    -------
    For ``'Boruta'``, ``'JMI'`` and ``'FDR'`` a numpy array with the column
    indices of the selected features; for ``'L1'`` whatever
    ``lsvc_cv.recursive_lsvc_cv`` returns.  Any other ``method`` yields an
    empty list.
    """
    # Unknown methods intentionally fall through to an empty selection.
    selected = []

    if method == 'Boruta':
        forest = RandomForestClassifier(n_jobs=-1)
        selector = boruta.BorutaPy(forest, n_estimators='auto')
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'JMI':
        selector = mifs.MutualInformationFeatureSelector(method='JMI')
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'L1':
        selected = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == 'FDR':
        # `alpha` must be given by keyword in current scikit-learn.
        selector = fs.SelectFdr(fs.f_classif, alpha=.05)
        selector.fit(X, y)
        selected = selector.get_support(indices=True)

    return selected
def select_best():
    """Print, for several univariate selectors, which column scores highest.

    Long-format data is generated from ``acw`` and from ``acz``, normalized,
    annotated with the restricted periphery/core network hierarchy and
    averaged per task/subject/region/net_meta.  The two frames are merged on
    those keys and rows with ``net_meta == "M"`` are filtered out.  The last
    two columns of the merged frame (presumably ``acw`` and ``acz`` - verify
    against the frame layout) form the feature matrix; ``net_meta`` mapped to
    ``C -> 0`` / ``P -> 1`` is the target.

    For every score function and every selector type the selector is fitted
    and one line is printed with the selector name, the score function name
    and the 1-based position of the highest-scoring column.
    """
    group_keys = ["task", "subject", "region", "net_meta"]

    def _regional_means(source, column):
        # Same preparation chain for both sources; only the final name of
        # the "metric" column differs.
        hierarchy = tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE)
        prepared = (
            source.gen_long_data(tpt)
            .normalize(columns="metric")
            .add_net_meta(hierarchy)
        )
        averaged = prepared.groupby(list(group_keys)).mean().reset_index()
        return averaged.rename(columns={"metric": column})

    left = _regional_means(acw, "acw")
    right = _regional_means(acz, "acz")
    merged = pd.merge(left, right, on=list(group_keys), sort=False)
    df = merged.and_filter(NOTnet_meta="M")

    X = df.iloc[:, -2:].values
    y = df.net_meta.map({"C": 0, "P": 1}).values

    for func in (fs.mutual_info_classif, fs.f_classif, fs.chi2):
        selectors = [
            fs.SelectKBest(func, k=1),
            fs.SelectPercentile(func),
            fs.SelectFdr(func),
            fs.SelectFpr(func),
            fs.SelectFwe(func),
        ]
        for selector in selectors:
            selector.fit(X, y)
            label = str(selector).split("(")[0]
            best = np.argmax(selector.scores_) + 1
            print(f'{label} {func.__name__}: {best}')
except: if _sklearn_ver > 17: raise _feature_selectors = [] _feature_selectors.append((feature_selection.SelectKBest(k=1), pd_feature_selection.SelectKBest(k=1), True)) _feature_selectors.append( (feature_selection.SelectKBest(k=1), pickle.loads(pickle.dumps(pd_feature_selection.SelectKBest(k=1))), True)) _feature_selectors.append((feature_selection.SelectKBest(k=2), pd_feature_selection.SelectKBest(k=2), True)) _feature_selectors.append((feature_selection.SelectPercentile(), pd_feature_selection.SelectPercentile(), True)) _feature_selectors.append( (feature_selection.SelectFdr(), pd_feature_selection.SelectFdr(), True)) _feature_selectors.append( (feature_selection.SelectFwe(), pd_feature_selection.SelectFwe(), True)) # Tmp Ami if False: _feature_selectors.append( (feature_selection.RFE(linear_model.LogisticRegression()), pd_feature_selection.RFE(pd_linear_model.LogisticRegression()), True)) _keras_estimators = [] if _level > 0: _keras_estimators.append( (KerasClassifier(_build_classifier_nn, verbose=0), PdKerasClassifier(_build_classifier_nn, _load_iris()[0]['class'].unique(), verbose=0), False))
def _eval_search_params(params_builder): search_params = {} for p in params_builder['param_set']: search_list = p['sp_list'].strip() if search_list == '': continue param_name = p['sp_name'] if param_name.lower().endswith(NON_SEARCHABLE): print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) continue if not search_list.startswith(':'): safe_eval = SafeEval(load_scipy=True, load_numpy=True) ev = safe_eval(search_list) search_params[param_name] = ev else: # Have `:` before search list, asks for estimator evaluatio safe_eval_es = SafeEval(load_estimators=True) search_list = search_list[1:].strip() # TODO maybe add regular express check ev = safe_eval_es(search_list) preprocessings = ( preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(), feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(), feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(), feature_selection.SelectFwe(), feature_selection.VarianceThreshold(), decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.TruncatedSVD(random_state=0), kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0), kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0), cluster.FeatureAgglomeration(), skrebate.ReliefF(n_jobs=N_JOBS), 
skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS), skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.CondensedNearestNeighbour( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RepeatedEditedNearestNeighbours( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.InstanceHardnessThreshold( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NeighbourhoodCleaningRule( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RandomUnderSampler(random_state=0), imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.RandomOverSampler(random_state=0), imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0)) newlist = [] for obj in ev: if obj is None: newlist.append(None) elif obj == 'all_0': newlist.extend(preprocessings[0:35]) elif obj == 'sk_prep_all': # no KernalCenter() newlist.extend(preprocessings[0:7]) elif obj == 'fs_all': newlist.extend(preprocessings[7:14]) elif obj == 'decomp_all': newlist.extend(preprocessings[14:25]) elif obj == 'k_appr_all': newlist.extend(preprocessings[25:29]) elif obj == 'reb_all': newlist.extend(preprocessings[30:35]) elif obj == 'imb_all': newlist.extend(preprocessings[35:54]) elif type(obj) is int and -1 < obj 
< len(preprocessings): newlist.append(preprocessings[obj]) elif hasattr(obj, 'get_params'): # user uploaded object if 'n_jobs' in obj.get_params(): newlist.append(obj.set_params(n_jobs=N_JOBS)) else: newlist.append(obj) else: sys.exit("Unsupported estimator type: %r" % (obj)) search_params[param_name] = newlist return search_params
def get_search_params(params_builder):
    """Translate ``params_builder`` into a search-parameter dictionary.

    Every entry of ``params_builder['param_set']`` carries, under
    ``'search_param_selector'``, a ``'search_p'`` string of the form
    ``"<name>: <expression>"`` and a ``'selected_param_type'``.  Entries with
    a blank ``'search_p'`` are ignored.  Exactly one colon is required
    (enforced with an ``assert``).

    * Non-empty name not ending in ``-``: the expression is evaluated with a
      scipy/numpy-enabled ``SafeEval``.
    * Non-empty name ending in ``-``: the expression is evaluated as a list
      of estimators, ``n_jobs`` is set to ``N_JOBS`` on those that have it,
      and the trailing ``-`` is dropped from the name.
    * Empty name with a preprocessing parameter type: the expression is
      evaluated as estimators/keywords/indices and expanded to preprocessor
      objects for the whole preprocessing step.

    Result keys are ``'estimator__<name>'`` for the ``'final_estimator_p'``
    type and ``'preprocessing_<c>__<name>'`` (or ``'preprocessing_<c>'``)
    otherwise, where ``<c>`` is ``param_type[5:6]`` (presumably the step
    number - TODO confirm).

    Exits via ``sys.exit`` for an ``n_jobs`` name, for an unsupported
    preprocessor element, or for an empty name on the final estimator.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    def _target_key(kind, name):
        # Key under which a named parameter is stored.
        if kind == 'final_estimator_p':
            return 'estimator__' + name
        return 'preprocessing_' + kind[5:6] + '__' + name

    for entry in params_builder['param_set']:
        selector = entry['search_param_selector']
        search_p = selector['search_p']
        if search_p.strip() == '':
            continue
        param_type = selector['selected_param_type']

        pieces = search_p.split(':')
        assert (
            len(pieces) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        param_name = pieces[0].strip()
        literal = pieces[1].strip()

        if not param_name:
            if param_type == 'final_estimator_p':
                sys.exit("Parameter name of the final estimator can't be skipped!")

            # TODO regular expression check ?
            requested = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]

            # Group keywords and the slice of `preprocessors` each expands
            # to, checked in this order.
            named_groups = (
                ('all_0', slice(0, 36)),
                ('sk_prep_all', slice(0, 8)),      # no KernelCenterer()
                ('fs_all', slice(8, 15)),
                ('decomp_all', slice(15, 26)),
                ('k_appr_all', slice(26, 30)),
                ('reb_all', slice(31, 36)),
                ('imb_all', slice(36, 55)),
            )

            expanded = []
            for obj in requested:
                if obj is None:
                    expanded.append(None)
                    continue

                for keyword, span in named_groups:
                    if obj == keyword:
                        expanded.extend(preprocessors[span])
                        break
                else:
                    if type(obj) is int and -1 < obj < len(preprocessors):
                        # A plain integer picks one entry by position.
                        expanded.append(preprocessors[obj])
                    elif hasattr(obj, 'get_params'):
                        # User supplied object.
                        if 'n_jobs' in obj.get_params():
                            expanded.append(obj.set_params(n_jobs=N_JOBS))
                        else:
                            expanded.append(obj)
                    else:
                        sys.exit("Unsupported preprocessor type: %r" % (obj))

            search_params['preprocessing_' + param_type[5:6]] = expanded
            continue

        if param_name.lower() == 'n_jobs':
            sys.exit("Parameter `%s` is invalid for search." % param_name)

        if param_name.endswith('-'):
            # A trailing `-` marks the value as a list of estimators.
            # TODO maybe add regular expression check
            estimators = safe_eval_es(literal)
            for obj in estimators:
                if 'n_jobs' in obj.get_params():
                    obj.set_params(n_jobs=N_JOBS)
            search_params[_target_key(param_type, param_name[:-1])] = estimators
        else:
            values = safe_eval(literal)
            search_params[_target_key(param_type, param_name)] = values

    return search_params
def do_fs(X, y):
    """Run seven feature selection methods on a classification problem.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature matrix; ``X.shape[0]`` is used as the number of samples.
    y : array-like
        Class labels.

    Returns
    -------
    tuple
        ``(sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr,
        sel_b_rf, sel_jmi)``.  Every element holds the selected feature
        column indices, except ``sel_lsvc`` which is whatever
        ``lsvc_cv.recursive_lsvc_cv`` returns.

    Notes
    -----
    ``RandomizedLogisticRegression`` is no longer shipped with recent
    scikit-learn releases, so the stability selection step needs an
    environment where that name is available.
    """
    n_samples = X.shape[0]
    # One fold count shared by the grid search, the C adjustment and RFECV.
    cv_num = 3

    # --------------------------------------------------------------
    # UNIVARIATE FEATURE SELECTION
    # percentile - keep the top 10% of features
    sel_uni_perc = (fs.SelectPercentile(fs.f_classif, percentile=10)
                    .fit(X, y).get_support(indices=True))
    # fdr - control the false discovery rate at alpha = .05
    sel_uni_fdr = (fs.SelectFdr(fs.f_classif, alpha=.05)
                   .fit(X, y).get_support(indices=True))

    # --------------------------------------------------------------
    # RFECV
    # cross-validated grid search for the optimal C; the fold count is
    # passed explicitly so it matches the adjustment computed below
    # instead of depending on the library default
    gridC = {'C': np.logspace(-6, 3, 10)}
    svc = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)
    grid_cv = GridSearchCV(svc, gridC, scoring='accuracy', n_jobs=-1,
                           cv=cv_num)
    grid_cv.fit(X, y)

    # adjust the optimal C for the smaller training sample size used during
    # cross validation
    # http://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html
    # NOTE(review): this multiplies C by the number of training samples per
    # fold - confirm that this is the intended scaling.
    train_size = 1 - 1 / float(cv_num)
    adjust_c = float(n_samples * train_size)
    svc.set_params(C=grid_cv.best_params_['C'] * adjust_c)

    # cross-validated recursive feature elimination, removing 1% of the
    # worst features each round
    rfecv = fs.RFECV(estimator=svc, step=.01, cv=cv_num, scoring='accuracy')
    rfecv.fit(X, y)
    sel_rfecv = rfecv.get_support(indices=True)

    # --------------------------------------------------------------
    # L1 SVC
    sel_lsvc = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)

    # --------------------------------------------------------------
    # STABILITY SELECTION
    rlr = RandomizedLogisticRegression(n_resampling=1000,
                                       C=np.logspace(-2, 2, 5),
                                       selection_threshold=0.7,
                                       sample_fraction=0.5)
    sel_rlr = rlr.fit(X, y).get_support(indices=True)

    # --------------------------------------------------------------
    # BORUTA
    forest = RandomForestClassifier(n_jobs=-1)
    boruta_selector = boruta.BorutaPy(forest, n_estimators='auto')
    boruta_selector.fit(X, y)
    sel_b_rf = np.where(boruta_selector.support_)[0]

    # --------------------------------------------------------------
    # JMI
    jmi_selector = mifs.MutualInformationFeatureSelector(method='JMI')
    jmi_selector.fit(X, y)
    sel_jmi = np.where(jmi_selector.support_)[0]

    return (sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr,
            sel_b_rf, sel_jmi)