def test_multisurfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): a MultiSURF* pipeline accepts pandas inputs.

    Passes when the mean 3-fold cross-validation score of the
    MultiSURF* -> random forest pipeline is above 0.7.
    """
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(pipeline, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
def test_multisurfstar_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): parallel MultiSURF* in a sklearn pipeline.

    MultiSURF* itself is built with n_jobs=-1. Passes when the mean 3-fold
    cross-validation score of the pipeline is above 0.7.
    """
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(pipeline, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
def test_multisurfstar_pipeline():
    """Ensure a parallelized MultiSURF* step works inside a sklearn pipeline.

    Passes when the mean 3-fold cross-validation score of the
    MultiSURF* (n_jobs=-1) -> random forest pipeline is above 0.7.
    """
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(pipeline, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
def test_multisurfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): MultiSURF* works in a sklearn pipeline.

    Passes when the mean 3-fold cross-validation score on the
    mixed-attribute fixture is above 0.7.
    """
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(
        pipeline,
        features_mixed_attributes,
        labels_mixed_attributes,
        cv=3,
        n_jobs=-1,
    )
    assert np.mean(fold_scores) > 0.7
def test_multisurfstar_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): MultiSURF* works in a sklearn pipeline.

    Uses a random forest regressor as the final step. Passes when the
    absolute value of the mean 3-fold cross-validation score is below 0.5.
    """
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(
        pipeline,
        features_cont_endpoint,
        labels_cont_endpoint,
        cv=3,
        n_jobs=-1,
    )
    mean_score = np.mean(fold_scores)
    assert abs(mean_score) < 0.5
def rebate(df, target, n_features):
    """
    Run the MultiSURF* algorithm on a dataframe, returning the reduced df.

    Args:
        df (pandas.DataFrame): A dataframe
        target (str): The target key (must be present in df)
        n_features (int): The number of features desired to be returned.

    Returns:
        pd.DataFrame The dataframe with fewer features, and no target.
            Columns appear in the selector's ranking order (best first).
    """
    X = df.drop(target, axis=1)
    y = df[target]

    rf = MultiSURFstar(n_features_to_select=n_features, n_jobs=-1)
    rf.fit(X.values, y.values)

    # Select by position using the fitted ranking instead of matching the
    # transformed matrix back to columns by value: value matching returns
    # extra columns when two features hold identical data and drops any
    # selected feature containing NaN (NaN never compares equal).
    # NOTE(review): relies on skrebate exposing `top_features_` (feature
    # indices sorted by descending importance) after fit -- confirm against
    # the installed skrebate version.
    selected_positions = list(rf.top_features_[:n_features])
    return X.iloc[:, selected_positions]
def rank_features_by_rebate_methods(data_split_list, fs_method, iterate, remove_percent=0.1, verbose=False):
    """Compute per-fold feature importance scores with a Relief-based method.

    Args:
        data_split_list: Sequence of per-fold 4-tuples. The first two items
            of each tuple are the training feature data frame and the
            training response (both must expose ``.values``); the last two
            items are ignored here.
        fs_method: Feature ranking method to use: 'SURF', 'SURFstar',
            'MultiSURF', or 'MultiSURFstar'.
        iterate: Whether to apply the TuRF extension around the chosen
            method (TuRF removes low-ranking features after each
            iteration, effective when the number of features is large).
        remove_percent: Percentage of features removed at each TuRF
            iteration (only used when ``iterate`` is true).
        verbose: Whether to print progress for each fold.

    Returns:
        pandas.DataFrame with one column per fold ('Fold_0', 'Fold_1', ...)
        holding that fold's ``feature_importances_``, indexed by the
        training feature column names of the last fold.

    Raises:
        ValueError: If ``fs_method`` is not a supported method name, or if
            ``data_split_list`` is empty.
    """
    # 0. Validate inputs up front. Previously an unknown method name or an
    # empty fold list only surfaced later as an unbound-local error.
    supported_methods = ('SURF', 'SURFstar', 'MultiSURF', 'MultiSURFstar')
    if fs_method not in supported_methods:
        raise ValueError(
            'Unknown fs_method ' + repr(fs_method) + '; expected one of: '
            + ', '.join(supported_methods)
        )
    if len(data_split_list) == 0:
        raise ValueError('data_split_list must contain at least one fold')

    # 1. Build the feature ranking estimator.
    if iterate:
        # TuRF wraps the chosen core algorithm and prunes features iteratively.
        fs = TuRF(core_algorithm=fs_method, pct=remove_percent)
    elif fs_method == 'SURF':
        fs = SURF()
    elif fs_method == 'SURFstar':
        fs = SURFstar()
    elif fs_method == 'MultiSURF':
        fs = MultiSURF()
    else:
        fs = MultiSURFstar()

    # 2. Perform feature ranking on each fold of training data.
    feat_impt_dict = {}
    for i, (feat_train, label_train, _, _) in enumerate(data_split_list):
        if verbose:
            print('Computing feature importance scores using data from fold ' + str(i) + '\n')

        if iterate:
            # TuRF.fit additionally takes the list of feature names.
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)

        feat_impt_dict['Fold_' + str(i)] = fs.feature_importances_

    # 3. Aggregate results from multiple folds into one data frame.
    # The index comes from the last fold; this assumes every fold shares
    # the same feature columns in the same order.
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns
    return feat_impt_df