def test_surfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF* works with pandas DataFrame and Series inputs"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)) > 0.7
def test_surfstar_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): SURF* works in a sklearn pipeline when SURF* is parallelized"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def test_surfstar_pipeline():
    """Ensure that SURF* works in a sklearn pipeline when it is parallelized"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def test_surfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
def test_surfstar_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))
    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5
def test_surfstar_pipeline_missing_values():
    """Ensure that SURF* works in a sklearn pipeline with missing values"""
    np.random.seed(9238745)
    # SURF* scores the raw data (it handles missing values internally); imputation is
    # only applied afterwards for the downstream random forest.
    clf = make_pipeline(SURFstar(n_features_to_select=2, n_jobs=-1),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_missing_values,
                                   labels_missing_values, cv=3)) > 0.7
def test_surfstar_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)
    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_multiclass,
                                   labels_multiclass, cv=3, n_jobs=-1)) > 0.7
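# The tests above rely on module-level fixtures (features, labels, features_df, labels_s,
# features_mixed_attributes, features_cont_endpoint, features_missing_values,
# features_multiclass, and their label counterparts) defined elsewhere in the test module.
# The block below is only a minimal sketch of how such fixtures might be prepared; the
# file name 'gametes_epistasis.tsv.gz' and the 'class' column are assumptions, not the
# actual scikit-rebate data loader.
import numpy as np
import pandas as pd

genetic_data = pd.read_csv('gametes_epistasis.tsv.gz', sep='\t', compression='gzip')  # hypothetical path
features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values

# pandas variants used by test_surfstar_pandas_inputs
features_df = genetic_data.drop('class', axis=1)
labels_s = genetic_data['class']

# missing-value variant: blank out roughly 10% of entries at random
mask = np.random.rand(*features_df.shape) < 0.1
features_missing_values = features_df.mask(mask).values
labels_missing_values = labels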
    RandomForestClassifier(n_jobs=-1, random_state=args.randomseed),
    'ext': ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=args.randomseed),
    'svm': SVC(kernel='sigmoid', random_state=args.randomseed)
}

# Initialize the feature-selection (fs) dictionary
fs = {
    'ReliefF': ReliefF(n_features_to_select=100, verbose=False, n_jobs=-1),
    # 'TuRF': TuRF(core_algorithm="ReliefF", n_features_to_select=100, verbose=False, n_jobs=-1),
    'SURF': SURF(n_features_to_select=100, verbose=False, n_jobs=-1),
    'SURFstar': SURFstar(n_features_to_select=100, verbose=False, n_jobs=-1)
}

print('\nClassifier parameters:', clf[args.classifier])
print('\nStarting cross validation without feature selection...\n')

# Incremental feature prediction before feature ranking: look up the chosen classifier
# in the dict by name and score it on the first i + 1 features.
y_pred_list = [
    cross_val_predict(clf[args.classifier], X[:, 0:i + 1], y, cv=args.kfolds, n_jobs=-1)
    for i in trange(0, X.shape[1])
]
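# A plausible next step for this script (not shown in the fragment above) is to fit the
# chosen ReBATE method, reorder the columns of X by importance, and repeat the incremental
# prediction on the ranked features. This is only a sketch; args.fs_method and X_ranked
# are assumed names, while feature_importances_ is the attribute ReBATE estimators expose
# after fit.
print('\nRanking features with', args.fs_method, '...\n')
selector = fs[args.fs_method]
selector.fit(X, y)

# Sort columns from most to least important according to the ReBATE scores.
ranked_idx = np.argsort(selector.feature_importances_)[::-1]
X_ranked = X[:, ranked_idx]

# Incremental prediction after feature ranking, mirroring the loop above.
y_pred_ranked_list = [
    cross_val_predict(clf[args.classifier], X_ranked[:, 0:i + 1], y, cv=args.kfolds, n_jobs=-1)
    for i in trange(0, X_ranked.shape[1])
]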
def rank_features_by_rebate_methods(data_split_list, fs_method, iterate, remove_percent=0.1, verbose=False):
    ## 0. Input arguments:
    # data_split_list: list of data splits, one per fold; each element unpacks as
    #                  (training features, training labels, test features, test labels)
    # fs_method: feature ranking method to be used: 'SURF', 'SURFstar', 'MultiSURF', or 'MultiSURFstar'
    # iterate: whether to implement TuRF: True or False (TuRF removes low-ranking features after each
    #          iteration, which is effective when the number of features is large)
    # remove_percent: percentage of features removed at each iteration (only applied when iterate = True)
    # verbose: whether to show progress for each fold: True or False

    ## 1. Define the feature ranking method
    # SURF
    if fs_method == 'SURF':
        # Implement the TuRF extension when iterate == True
        if iterate == True:
            fs = TuRF(core_algorithm='SURF', pct=remove_percent)
        else:
            fs = SURF()
    # SURFstar
    if fs_method == 'SURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='SURFstar', pct=remove_percent)
        else:
            fs = SURFstar()
    # MultiSURF
    if fs_method == 'MultiSURF':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURF', pct=remove_percent)
        else:
            fs = MultiSURF()
    # MultiSURFstar
    if fs_method == 'MultiSURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURFstar', pct=remove_percent)
        else:
            fs = MultiSURFstar()

    ## 2. Perform feature ranking on each fold of training data
    # iterate by folds
    feat_impt_dict = {}
    for i in range(0, len(data_split_list)):
        # intermediate output
        if verbose == True:
            print('Computing feature importance scores using data from fold ' + str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = data_split_list[i]
        # fit the feature ranking model using the specified method
        if iterate == True:
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # output feature importance scores in a data frame
        fold_name = 'Fold_' + str(i)
        feat_impt_dict[fold_name] = fs.feature_importances_

    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns

    return feat_impt_df
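# A minimal usage sketch for rank_features_by_rebate_methods, assuming each element of
# data_split_list is a (feat_train, label_train, feat_test, label_test) tuple of pandas
# objects. The input file 'learning_data.csv' and the 'outcome' column are hypothetical.
import pandas as pd
from sklearn.model_selection import KFold

df = pd.read_csv('learning_data.csv')                    # hypothetical input file
X_df, y_s = df.drop(columns='outcome'), df['outcome']    # 'outcome' column is an assumption

data_split_list = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X_df):
    data_split_list.append((X_df.iloc[train_idx], y_s.iloc[train_idx],
                            X_df.iloc[test_idx], y_s.iloc[test_idx]))

# Rank features with MultiSURF, without the TuRF iteration
feat_impt_df = rank_features_by_rebate_methods(data_split_list, 'MultiSURF', iterate=False, verbose=True)
print(feat_impt_df.mean(axis=1).sort_values(ascending=False).head())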