def test_surf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF works with pandas DataFrame and Series inputs"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)) > 0.7
def test_surf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): SURF works in a sklearn pipeline when SURF is parallelized"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def test_surf_pipeline():
    """Ensure that SURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def test_surf_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF works in a sklearn pipeline"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
def test_surf_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): SURF works in a sklearn pipeline"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))
    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5
def test_surf_pipeline_missing_values():
    """Ensure that SURF works in a sklearn pipeline with missing values"""
    np.random.seed(240932)
    # SimpleImputer replaces the deprecated sklearn Imputer (removed in scikit-learn 0.22)
    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_missing_values,
                                   labels_missing_values, cv=3)) > 0.7
def test_surf_init():
    """Check: SURF constructor stores custom values correctly"""
    clf = SURF(n_features_to_select=7,
               discrete_threshold=20,
               verbose=True,
               n_jobs=3)
    assert clf.n_features_to_select == 7
    assert clf.discrete_threshold == 20
    assert clf.verbose is True
    assert clf.n_jobs == 3
def test_surf_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): SURF works in a sklearn pipeline"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_multiclass,
                                   labels_multiclass, cv=3, n_jobs=-1)) > 0.7
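# --- Hypothetical fixture setup for the SURF tests above. ---
# The real suite presumably loads benchmark datasets carrying genuine signal
# (purely random data would not pass the > 0.7 accuracy assertions); every
# shape and generator below is an illustrative assumption, not the project's
# actual test data.
import numpy as np
import pandas as pd

rng = np.random.RandomState(240932)

# binary endpoint, discrete features
features = rng.randint(0, 3, size=(200, 10)).astype(float)
labels = rng.randint(0, 2, size=200)

# pandas variants of the same data
features_df = pd.DataFrame(features)
labels_s = pd.Series(labels)

# continuous endpoint
features_cont_endpoint = rng.normal(size=(200, 10))
labels_cont_endpoint = rng.normal(size=200)

# missing values: blank out roughly 5% of entries
features_missing_values = features.copy()
features_missing_values[rng.rand(*features.shape) < 0.05] = np.nan
labels_missing_values = labels.copy()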
def get_selector(name, estimator=None, n_features_to_select=None, **params):
    if name == 'RobustSelector':
        return RobustSelector(estimator, n_features_to_select=n_features_to_select,
                              **search_dict(params, ('cv', 'verbose')))
    elif name == 'MaxFeatures':
        return SelectFromModel(estimator, threshold=-np.inf,
                               max_features=n_features_to_select)
    elif name == 'RandomSubsetSelector':
        return RandomSubsetSelector(estimator, n_features_to_select=n_features_to_select,
                                    **search_dict(params, ('n_subsets', 'subset_size', 'random_state')))
    elif name == 'FeatureImportanceThreshold':
        return SelectFromModel(estimator, **search_dict(params, 'threshold'))
    elif name == 'RFE':
        return RFE(estimator, n_features_to_select=n_features_to_select,
                   **search_dict(params, ('step', 'verbose')))
    elif name == 'RFECV':
        return RFECV(estimator, n_features_to_select=n_features_to_select,
                     **search_dict(params, ('step', 'cv', 'verbose')))
    elif name == 'FoldChangeFilter':
        return FoldChangeFilter(**search_dict(params, ('threshold', 'direction', 'below', 'pseudo_count')))
    elif name == 'ZeroFractionFilter':
        return ZeroFractionFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpkmFilter':
        return RpkmFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpmFilter':
        return RpmFilter(**search_dict(params, ('threshold',)))
    elif name == 'DiffExpFilter':
        return DiffExpFilter(max_features=n_features_to_select,
                             **search_dict(params, ('threshold', 'script', 'temp_dir', 'score_type', 'method')))
    elif name == 'ReliefF':
        from skrebate import ReliefF
        return ReliefF(n_features_to_select=n_features_to_select,
                       **search_dict(params, ('n_jobs', 'n_neighbors', 'discrete_limit')))
    elif name == 'SURF':
        from skrebate import SURF
        return SURF(n_features_to_select=n_features_to_select,
                    **search_dict(params, ('n_jobs', 'discrete_limit')))
    elif name == 'MultiSURF':
        from skrebate import MultiSURF
        return MultiSURF(n_features_to_select=n_features_to_select,
                         **search_dict(params, ('n_jobs', 'discrete_limit')))
    elif name == 'SIS':
        return SIS(n_features_to_select=n_features_to_select,
                   **search_dict(params, ('temp_dir', 'sis_params')))
    elif name == 'NullSelector':
        return NullSelector()
    else:
        raise ValueError('unknown selector: {}'.format(name))
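# Example use of the selector factory above. RandomForestClassifier as the
# wrapped estimator and the parameter values are arbitrary illustrative
# choices, not defaults of this codebase.
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=100, random_state=0)
selector = get_selector('RFE', estimator=estimator, n_features_to_select=20, step=0.1)
# X_selected = selector.fit_transform(X, y)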
    GradientBoostingClassifier(random_state=args.randomseed),
    'rf': RandomForestClassifier(n_jobs=-1, random_state=args.randomseed),
    'ext': ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=args.randomseed),
    'svm': SVC(kernel='sigmoid', random_state=args.randomseed)
}

# Initialize the feature-selector (fs) dictionary
fs = {
    'ReliefF': ReliefF(n_features_to_select=100, verbose=False, n_jobs=-1),
    # 'TuRF': TuRF(core_algorithm="ReliefF", n_features_to_select=100, verbose=False, n_jobs=-1),
    'SURF': SURF(n_features_to_select=100, verbose=False, n_jobs=-1),
    'SURFstar': SURFstar(n_features_to_select=100, verbose=False, n_jobs=-1)
}

print('\nClassifier parameters:', clf[args.classifier])
print('\nStarting cross-validation without feature selection...\n')

# Incremental-feature prediction before feature ranking; look up the requested
# classifier in the dict by name
y_pred_list = [
    cross_val_predict(clf[args.classifier], X[:, 0:i + 1], y, cv=args.kfolds,
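# Self-contained sketch of the incremental-feature evaluation that the
# truncated list comprehension above appears to perform: score the first
# i + 1 (ranked) features for growing i. Dataset, classifier, and fold count
# are placeholders, not values from the original script.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
clf_demo = RandomForestClassifier(n_jobs=-1, random_state=0)

acc_curve = []
for i in range(X_demo.shape[1]):
    y_pred = cross_val_predict(clf_demo, X_demo[:, :i + 1], y_demo, cv=5, n_jobs=-1)
    acc_curve.append(accuracy_score(y_demo, y_pred))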
def relief(X, y):
    np.random.seed(0)
    return SURF().fit(X, y).feature_importances_
def relief(X, y):
    return SURF().fit(X, y).feature_importances_
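# Example: rank columns by SURF importance via the relief() helper above and
# keep the top k; make_classification is only a stand-in dataset.
import numpy as np
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, n_features=10, random_state=0)
importances = relief(X_toy, y_toy)
top_5 = np.argsort(importances)[::-1][:5]  # indices of the 5 highest-scoring features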
def FeatureWeights(self, weights=("pearson", "variance"), **kwargs):
    """
    Calculates the requested weights and logs them.

    :param weights: a list of weights, a subset of {'pearson', 'variance', 'relieff',
        'surf', 'sobol', 'morris', 'delta-mmnt', 'info-gain'}
    :param kwargs: all input acceptable by ``skrebate.ReliefF``, ``skrebate.SURF``,
        ``sensapprx.SensAprx``
    :return: None
    """
    from pandas import DataFrame, read_sql

    self.data = read_sql("SELECT * FROM data", self.conn)
    features = list(self.data.columns)
    features.remove(self.target)
    weights_df = read_sql("SELECT * FROM weights", self.conn)
    if len(weights_df) == 0:
        weights_df = DataFrame({"feature": features})
    X = self.data[features].values
    y = self.data[self.target].values
    n_features = kwargs.get("n_features", int(len(features) / 2))
    domain = None
    probs = None
    regressor = kwargs.get("regressor", None)
    reduce = kwargs.get("reduce", True)
    num_smpl = kwargs.get("num_smpl", 700)
    W = {"feature": features}
    for factor in weights:
        if factor == "pearson":
            Res = dict(self.data.corr(method="pearson").fillna(0)[self.target])
            W["pearson"] = [Res[v] for v in features]
        elif factor == "variance":
            Res = dict(self.data.var())
            W["variance"] = [Res[v] for v in features]
        elif factor == "relieff":
            from skrebate import ReliefF

            n_neighbors = kwargs.get("n_neighbors", 80)
            RF = ReliefF(n_features_to_select=n_features, n_neighbors=n_neighbors)
            RF.fit(X, y)
            W["relieff"] = [RF.feature_importances_[features.index(v)] for v in features]
        elif factor == "surf":
            from skrebate import SURF

            RF = SURF(n_features_to_select=n_features)
            RF.fit(X, y)
            W["surf"] = [RF.feature_importances_[features.index(v)] for v in features]
        elif factor == "sobol":
            from .sensapprx import SensAprx

            SF = SensAprx(method="sobol", domain=domain, probs=probs,
                          regressor=regressor, reduce=reduce, num_smpl=num_smpl)
            SF.fit(X, y)
            domain = SF.domain
            probs = SF.probs
            W["sobol"] = [SF.weights_[features.index(v)] for v in features]
        elif factor == "morris":
            from .sensapprx import SensAprx

            SF = SensAprx(method="morris", domain=domain, probs=probs,
                          regressor=regressor, reduce=reduce, num_smpl=num_smpl)
            SF.fit(X, y)
            domain = SF.domain
            probs = SF.probs
            W["morris"] = [SF.weights_[features.index(v)] for v in features]
        elif factor == "delta-mmnt":
            from .sensapprx import SensAprx

            SF = SensAprx(method="delta-mmnt", domain=domain, probs=probs,
                          regressor=regressor, reduce=reduce, num_smpl=num_smpl)
            SF.fit(X, y)
            domain = SF.domain
            probs = SF.probs
            W["delta_mmnt"] = [SF.weights_[features.index(v)] for v in features]
        elif factor == "info-gain":
            from sklearn.feature_selection import mutual_info_classif

            Res = mutual_info_classif(X, y, discrete_features=True)
            W["info_gain"] = [Res[features.index(v)] for v in features]
    new_w_df = DataFrame(W)
    merged = weights_df.merge(new_w_df, on="feature")
    # fillna returns a new frame; reassign so the fill actually takes effect
    merged = merged.fillna(0.0)
    merged.to_sql("weights", self.conn, if_exists="replace", index=False)
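# Hypothetical call site for FeatureWeights. The owning class is not shown in
# this excerpt; judging from the attributes used (self.conn, self.target,
# self.data), it wraps a SQLite connection holding `data` and `weights`
# tables. `SomeAnalyzer` below is an assumed name, not part of the codebase.
import sqlite3
import pandas as pd

conn = sqlite3.connect(":memory:")
pd.DataFrame({"f1": [0.1, 0.4, 0.9, 0.3],
              "f2": [1.0, 0.2, 0.5, 0.8],
              "label": [0, 1, 1, 0]}).to_sql("data", conn, index=False)
pd.DataFrame({"feature": []}).to_sql("weights", conn, index=False)

# analyzer = SomeAnalyzer(conn=conn, target="label")
# analyzer.FeatureWeights(weights=("pearson", "variance"))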
def rank_features_by_rebate_methods(data_split_list, fs_method, iterate,
                                    remove_percent=0.1, verbose=False):
    ## 0. Input arguments:
    # data_split_list: list of (feat_train, label_train, feat_test, label_test) tuples, one per fold
    # fs_method: feature ranking method to be used: 'SURF', 'SURFstar', 'MultiSURF', or 'MultiSURFstar'
    # iterate: whether to implement TuRF: True or False (TuRF removes low-ranking
    #          features after each iteration, effective when #features is large)
    # remove_percent: percentage of features removed at each iteration (only applied when iterate == True)
    # verbose: whether to show progress for each fold: True or False

    ## 1. Define the feature ranking method
    # SURF
    if fs_method == 'SURF':
        # Implement the TuRF extension when 'iterate == True'
        if iterate == True:
            fs = TuRF(core_algorithm='SURF', pct=remove_percent)
        else:
            fs = SURF()
    # SURFstar
    if fs_method == 'SURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='SURFstar', pct=remove_percent)
        else:
            fs = SURFstar()
    # MultiSURF
    if fs_method == 'MultiSURF':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURF', pct=remove_percent)
        else:
            fs = MultiSURF()
    # MultiSURFstar
    if fs_method == 'MultiSURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURFstar', pct=remove_percent)
        else:
            fs = MultiSURFstar()

    ## 2. Perform feature ranking on each fold of training data
    # iterate over folds
    feat_impt_dict = {}
    for i in range(0, len(data_split_list)):
        # intermediate output
        if verbose == True:
            print('Computing feature importance scores using data from fold ' + str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = data_split_list[i]
        # fit feature ranking model using the specified method
        # (TuRF additionally needs the list of feature names)
        if iterate == True:
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # collect feature importance scores for this fold
        fold_name = 'Fold_' + str(i)
        feat_impt_dict[fold_name] = fs.feature_importances_

    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns
    return feat_impt_df
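# Sketch of the expected input and a call: data_split_list is a list of
# (feat_train, label_train, feat_test, label_test) tuples, one per fold.
# The synthetic folds below are illustrative only; real data should carry
# signal for the importance scores to be meaningful.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
folds = []
for _ in range(3):
    feat = pd.DataFrame(rng.rand(50, 8), columns=['f%d' % j for j in range(8)])
    label = pd.Series(rng.randint(0, 2, size=50))
    folds.append((feat, label, None, None))

scores = rank_features_by_rebate_methods(folds, 'SURF', iterate=False, verbose=True)
print(scores.mean(axis=1).sort_values(ascending=False))  # mean importance per feature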