def select_features(X: pd.DataFrame, y: pd.Series, mode: str,
                    n_estimators: int = 50, max_iter: int = 100,
                    perc: int = 75, learning_rate: float = 0.01,
                    verbosity: int = -1, seed: int = 1, max_depth: int = -1,
                    random_state: int = 1, verbose: int = 2) -> List[str]:
    feat_estimator = LGBMFeatureEstimator(
        {
            "objective": "regression" if mode == "regression" else "binary",
            "metric": "rmse" if mode == "regression" else "auc",
            "learning_rate": learning_rate,
            "verbosity": verbosity,
            "seed": seed,
            "max_depth": max_depth,
        },
        n_estimators)
    feat_selector = BorutaPy(feat_estimator, n_estimators=n_estimators,
                             max_iter=max_iter, verbose=verbose,
                             random_state=random_state, perc=perc)
    try:
        feat_selector.fit(X.values, y.values.ravel())
    except TypeError:
        # swallow a late TypeError from fit(); by that point support_ is
        # expected to already be populated, otherwise the return below fails
        pass
    return X.columns[feat_selector.support_].tolist()
def boruta_selection(x_train, y_train, seed):
    """Kursa, M., Rudnicki, W., "Feature Selection with the Boruta Package",
    Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010."""
    # define random forest regressor, utilising all cores and
    # out-of-bag scoring
    rf = RandomForestRegressor(n_jobs=-1, oob_score=True)
    feat_selector = BorutaPy(
        rf,
        n_estimators="auto",
        max_iter=100,
        alpha=0.05,
        verbose=2,
        random_state=seed,
    )
    # find all relevant features
    selector = feat_selector.fit(np.asarray(x_train), np.asarray(y_train))
    # boolean mask of confirmed features
    return selector.support_
def get_boruta(X, y):
    """
    Returns the features selected by the Boruta algorithm for the passed dataset
    :param X: pandas DataFrame of features (column names are used for reporting)
    :param y: array-like target
    """
    from boruta import BorutaPy
    from sklearn.ensemble import RandomForestRegressor
    import numpy as np

    # initialize Boruta
    forest = RandomForestRegressor(n_jobs=-1, max_depth=5)
    boruta = BorutaPy(
        estimator=forest,
        n_estimators='auto',
        max_iter=100  # number of trials to perform
    )

    # fit Boruta (it accepts np.array, not pd.DataFrame)
    boruta.fit(np.array(X), np.array(y))

    # print results
    green_area = X.columns[boruta.support_].to_list()
    blue_area = X.columns[boruta.support_weak_].to_list()
    print('features in the green area:', green_area)
    print('features in the blue area:', blue_area)
    print('features ranking:', boruta.ranking_)  # was boruta._rankings, which does not exist
    return boruta
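# --- Usage sketch for get_boruta (not from the original source). Assumes
# scikit-learn's load_diabetes toy regression set; the dataset choice is
# illustrative only.
import pandas as pd
from sklearn.datasets import load_diabetes

# load a small regression dataset as a DataFrame so X.columns is available
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

boruta = get_boruta(X, y)
# confirmed features can then be selected with the boolean mask
X_selected = X.loc[:, boruta.support_]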
def test_if_boruta_extracts_relevant_features(self):
    np.random.seed(42)
    y = np.random.binomial(1, 0.5, 1000)
    X = np.zeros((1000, 10))
    z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000)
    z[z == -1] = 0
    z[z == 2] = 1

    # 5 relevant features
    X[:, 0] = z
    X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000)
    X[:, 2] = y + np.random.normal(0, 1, 1000)
    X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000)
    X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

    # 5 irrelevant features
    X[:, 5] = np.random.normal(0, 1, 1000)
    X[:, 6] = np.random.poisson(1, 1000)
    X[:, 7] = np.random.binomial(1, 0.3, 1000)
    X[:, 8] = np.random.normal(0, 1, 1000)
    X[:, 9] = np.random.poisson(1, 1000)

    rfc = RandomForestClassifier()
    bt = BorutaPy(rfc)
    bt.fit(X, y)

    # make sure that all and only the relevant features are returned
    # (assertItemsEqual was renamed assertCountEqual in Python 3)
    self.assertCountEqual(range(5), list(np.where(bt.support_)[0]))
def run(self):
    print("Here : ")
    df_dummies = pd.read_csv(data_transformation().output()['output1'].path)
    X_all = pd.read_csv(data_transformation().output()['output2'].path)
    y_all = pd.read_csv(data_transformation().output()['output3'].path)

    X_boruta = X_all.values
    y_boruta = y_all.values
    # a 'NO' label is inserted at position 7031, presumably to realign y with X
    y_boruta = np.insert(y_boruta, 7031, 'NO')

    # define random forest classifier, utilising all cores and
    # sampling in proportion to y labels
    rfc = RandomForestClassifier(n_jobs=-1)
    # define the Boruta feature selection method
    feature_selector = BorutaPy(rfc, n_estimators='auto', random_state=1)
    # find all relevant features
    feature_selector.fit(X_boruta, y_boruta)

    # transpose the dataframe for ranking
    df_features_rank = df_dummies.drop(['Churn'], axis=1).T
    # attach the ranking of each feature
    df_features_rank['Boruta_Rank'] = feature_selector.ranking_
    # add a 'Feature' column from the index
    df_features_rank['Feature'] = df_features_rank.index
    # sort the dataframe by rank
    df_features_rank = df_features_rank.sort_values('Boruta_Rank')
    # extract only the top-2-ranked features
    df_top2_ranked_feature = df_features_rank.loc[
        df_features_rank['Boruta_Rank'].isin([1, 2])]
    # select the important features
    selected_features = df_top2_ranked_feature.index
    X_selected = df_dummies[selected_features]
    y_selected = df_dummies["Churn"]

    print(self.output())
    X_selected.to_csv(self.output()['output1'].path, index=False)
    y_selected.to_csv(self.output()['output2'].path, index=False)
def by_boruta(data):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from boruta import BorutaPy

    y = data.loc[:, 'type'].values
    y = y.astype(int)
    X = data.drop(columns=['type'])
    features = X.columns.to_list()
    X = X.values
    X = X.astype(int)

    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)
    feat_selector.fit(X, y)

    df = pd.DataFrame(data={
        'features': features,
        'ranking': feat_selector.ranking_
    })
    # rank 1 marks confirmed features, so sort ascending to put the best first
    # (the original sorted descending, which put the worst-ranked features first)
    df.sort_values("ranking", ascending=True, inplace=True)
    top_features = df.features.to_list()
    return top_features
def Boruta_fs(x_train, y_train):
    """Perform feature selection using Boruta.

    Arguments: x_train, y_train
    """
    estimator = RandomForestClassifier(n_jobs=-1, random_state=0,
                                       class_weight='balanced')
    selector = BorutaPy(estimator, n_estimators='auto', verbose=2,
                        random_state=0)  # defaults: perc=100, max_iter=100, two_step=True
    selector.fit(x_train.values, y_train.values)

    feature_names = x_train.columns.values
    # ranked list of all features
    df_rank = pd.DataFrame({
        'Rank': selector.ranking_,
        'Features': feature_names
    })
    # confirmed features have rank 1
    confirmed_indices = np.where(selector.ranking_ == 1)
    confirmed_names = x_train.columns.values[confirmed_indices]
    df_rank_confirmed = pd.DataFrame(confirmed_names)
    df_rank_confirmed.index += 1
    return df_rank, df_rank_confirmed
def feature_engineering(X_all, y_all, df_dummies):
    # change X and y to their underlying numpy values
    X_boruta = X_all.values
    y_boruta = y_all.values

    # define random forest classifier, utilising all cores and
    # sampling in proportion to y labels
    rfc = RandomForestClassifier(n_jobs=-1)
    # define the Boruta feature selection method
    feature_selector = BorutaPy(rfc, n_estimators='auto', random_state=1)
    # find all relevant features
    feature_selector.fit(X_boruta, y_boruta)

    # transpose the dataframe for ranking
    df_features_rank = df_dummies.drop(['Churn'], axis=1).T
    # attach the ranking of each feature
    df_features_rank['Boruta_Rank'] = feature_selector.ranking_
    # add a 'Feature' column from the index
    df_features_rank['Feature'] = df_features_rank.index
    # sort the dataframe by rank
    df_features_rank = df_features_rank.sort_values('Boruta_Rank')
    # extract only the top-2-ranked features
    df_top2_ranked_feature = df_features_rank.loc[
        df_features_rank['Boruta_Rank'].isin([1, 2])]
    # select the important features
    selected_features = df_top2_ranked_feature.index
    X_selected = df_dummies[selected_features]
    y_selected = df_dummies["Churn"]

    # pickle the selected features for Form Uploads
    upload_featuredIndexFilePath = pickle_df_index(X_selected,
                                                   'featured_index_dict.pkl')
    return X_selected, y_selected, upload_featuredIndexFilePath
class DataTransformerBoruta:
    def __init__(self, corr_th, n_est=500, seed=123):
        self.boruta = True
        rfc = RandomForestClassifier(n_estimators=n_est,
                                     class_weight="balanced", n_jobs=6)
        self.feature_selector = BorutaPy(rfc, n_estimators="auto", verbose=0,
                                         random_state=seed, max_iter=100)
        self.corr_rem = CorrelationRemover(corr_th)

    def fit_transform(self, X, y):
        X_arr = np.array(X)
        y_arr = np.array(y).reshape(-1)
        self.feature_selector.fit(X_arr, y_arr)
        X_columns = X.columns
        selected_columns = X_columns[self.feature_selector.support_]
        X = X[selected_columns]
        X = self.corr_rem.fit_transform(X)
        return X

    def transform(self, X):
        X_columns = X.columns
        selected_columns = X_columns[self.feature_selector.support_]
        X = X[selected_columns]
        X = self.corr_rem.transform(X)
        return X

    def get_selected_num(self):
        return self.feature_selector.n_features_ - self.corr_rem.get_removed_num()

    def get_selected_vec(self, X):
        col_names = X.columns
        selected_columns = col_names[self.feature_selector.support_]
        cor_removed = self.corr_rem.get_removed_vec()
        selected_columns = set(selected_columns) - set(cor_removed)
        return np.array(list(selected_columns)) + 1  # +1 because columns are counted from 1
def boruta_selector(df, y=None):
    Y = df[y]
    df = df.drop(y, axis=1)
    num_feat = df.select_dtypes(include=['int', 'float']).columns.tolist()
    cat_feat = df.select_dtypes(include=['object']).columns.tolist()

    pipe_num_tree = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
    pipe_cat_tree = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('cat_transformer', OrdinalEncoder())
    ])
    preprocessor_tree = ColumnTransformer(transformers=[
        ('num_preprocessor', pipe_num_tree, num_feat),
        ('cat_preprocessor', pipe_cat_tree, cat_feat)
    ])
    # full pipeline with the model (defined but not used below)
    RF = Pipeline(steps=[
        ('preprocessor_rf', preprocessor_tree),
        ('model_rf', RandomForestClassifier(random_state=123, max_depth=5))
    ])
    X = preprocessor_tree.fit_transform(df)

    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    # create the Boruta selector (up to 100 iterations to converge)
    feat_selector = BorutaPy(rf, n_estimators='auto', random_state=123,
                             max_iter=100)
    feat_selector.fit(X, Y)

    # third filter: columns *not* selected by Boruta
    cols_drop_boruta = [not x for x in feat_selector.support_.tolist()]  # invert the True/False vector
    cols_drop_boruta = df.loc[:, cols_drop_boruta].columns.tolist()
    return cols_drop_boruta
def select_features_by_boruta(X_train, X_test, y_train):
    model = RandomForestRegressor(
        n_estimators=50,
        max_depth=5,
        max_features='sqrt',
        n_jobs=-1,
        verbose=True,
        random_state=1
    )
    features_selector = BorutaPy(
        model,
        n_estimators='auto',
        perc=80,
        verbose=2,
        two_step=False,
        max_iter=100,
        random_state=1
    )
    features_selector.fit(X_train.values, y_train.values)

    X_train_selected = X_train.iloc[:, features_selector.support_]
    X_test_selected = X_test.iloc[:, features_selector.support_]
    feature_selected_cols = list(X_train_selected.columns)
    print('Selected features are: ', feature_selected_cols)
    return feature_selected_cols, X_train_selected, X_test_selected
def main():
    df = load_data()

    # split data into X and y
    X = df.drop(["Outcome"], axis=1)
    y = df["Outcome"]

    # define random forest classifier, utilising all cores and
    # sampling in proportion to y labels
    clf = RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                 class_weight="balanced", max_depth=5)
    # define the Boruta feature selection method
    feat_selector = BorutaPy(clf, n_estimators="auto", random_state=23)
    # find all relevant features
    feat_selector = feat_selector.fit(X.values, y.values)

    # number of selected features
    print("\n Number of selected features: ")
    print(feat_selector.n_features_)

    # set up a dataframe ranking the features
    feature_df = pd.DataFrame(X.columns.tolist(), columns=["features"])
    feature_df["rank"] = feat_selector.ranking_
    feature_df = feature_df.sort_values("rank", ascending=True).reset_index(drop=True)
    print("\n Top %d features:" % feat_selector.n_features_)
    print(feature_df.head(feat_selector.n_features_))

    # save the feature ranking to a csv file
    feature_df.to_csv(data_dir + "boruta-feature-ranking.csv", index=False)
def __init__(self):
    self.rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    self.boruta_selector = BorutaPy(self.rfc, n_estimators='auto',
                                    random_state=50)
    self.X = None
    self.cols = None
def select_features(X: pd.DataFrame, y: pd.Series, mode: str,
                    n_estimators: int = 50, max_iter: int = 50,
                    perc: int = 75) -> List[str]:
    feat_estimator = LGBMFeatureEstimator(
        {
            "objective": "regression" if mode == "regression" else "binary",
            "metric": "rmse" if mode == "regression" else "auc",
            "learning_rate": 0.01,
            "verbosity": -1,
            "seed": 1,
            "max_depth": 7,
            "min_data_in_leaf": 3,
        },
        n_estimators)
    feat_selector = BorutaPy(feat_estimator, n_estimators=n_estimators,
                             max_iter=max_iter, verbose=2, random_state=1,
                             perc=perc)
    try:
        feat_selector.fit(X.values, y.values.ravel())
    except Exception:
        # a bare except would also swallow KeyboardInterrupt/SystemExit;
        # a failed fit() is ignored here, but support_ must already exist
        # for the return below to work
        pass
    return X.columns[feat_selector.support_].tolist()
def fitBorutaRF():
    # Boruta documentation: https://pypi.python.org/pypi/Boruta/0.1.5
    print('Feature selection from Boruta RandomForestClassifier: ')
    rf = RandomForestClassifier(n_jobs=-1, random_state=0, max_depth=5,
                                class_weight='balanced')
    # define the Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0,
                             random_state=0)
    # find all relevant features
    feat_selector.fit(X_train.values, y_train.values)

    # selected features are flagged in feat_selector.support_
    X_important_train = X_train.iloc[:, feat_selector.support_]
    X_important_test = X_test.iloc[:, feat_selector.support_]
    print("Boruta selected features for the model: ",
          list(X_important_train.columns))
    # feature ranking is available in feat_selector.ranking_
    return X_important_train, X_important_test
def main():
    print("Begin Feature Selection Step...")
    print('-' * 60)
    print('Loading Data...')
    df = pd.read_csv("./data/my_midterm_train.csv")
    y = df['y']
    X = df.drop(['y'], axis=1)

    # define random forest classifier, utilising all cores and
    # sampling in proportion to y labels
    # (class_weight='auto' was deprecated and removed; use 'balanced')
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    # define the Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)

    print("Fitting Boruta...")
    # find all relevant features
    feat_selector.fit(X.values, y.values)

    print("Selected Features:")
    print(feat_selector.support_)
    support = pd.DataFrame(feat_selector.support_)

    print("Selected Feature Rank:")
    print(feat_selector.ranking_)
    ranking = pd.DataFrame(feat_selector.ranking_)  # was support_ by mistake

    # filter X down to the selected features (.ix is deprecated; use .loc)
    print("Transforming X...")
    X_filtered = X.loc[:, feat_selector.support_]

    print("Writing Data...")
    support.to_csv("./work_dir/feature_support.csv", index=False)
    ranking.to_csv("./work_dir/feature_ranking.csv", index=False)
    combined_df = pd.concat([X_filtered, y], axis=1)
    combined_df.to_csv("./data/boruta_filtered_stacked_train.csv", index=False)
def test_if_boruta_extracts_relevant_features(self):
    np.random.seed(42)
    y = np.random.binomial(1, 0.5, 1000)
    X = np.zeros((1000, 10))
    z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000)
    z[z == -1] = 0
    z[z == 2] = 1

    # 5 relevant features
    X[:, 0] = z
    X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000)
    X[:, 2] = y + np.random.normal(0, 1, 1000)
    X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000)
    X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

    # 5 irrelevant features
    X[:, 5] = np.random.normal(0, 1, 1000)
    X[:, 6] = np.random.poisson(1, 1000)
    X[:, 7] = np.random.binomial(1, 0.3, 1000)
    X[:, 8] = np.random.normal(0, 1, 1000)
    X[:, 9] = np.random.poisson(1, 1000)

    rfc = RandomForestClassifier()
    bt = BorutaPy(rfc)
    bt.fit(X, y)

    # make sure that all and only the relevant features are returned
    self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0]))
def boruta_select(X_df, Y, perc_list=[20], allowed_perc_good=.5,
                  allowed_perc_med=.70, samples=[1], multiclass=False):
    """
    Runs the Boruta selector.

    :param X_df: the X DataFrame that the selector will run on
    :param Y: the y for the training of the selector
    :param perc_list: the percentages that Boruta will be run with
    :param allowed_perc_good: fraction of runs in which a variable has to beat the shadow features
    :param allowed_perc_med: fraction of runs in which a variable has to be tentative
    :param samples: unused at the moment; possible expansion into sampling
    :param multiclass: whether the problem is multiclass or not
    :return: first dataframe says whether the variable should be used, second
        which variables were relevant at each percentage, third which variables
        were tentative at each percentage
    """
    y = Y.values.ravel()
    res_df_good = pd.DataFrame(index=X_df.columns)
    res_df_med = pd.DataFrame(index=X_df.columns)
    use_df = pd.DataFrame(index=X_df.columns)

    # the original defined identical parameter dicts for the multiclass and
    # binary branches, so a single dict is used here
    params_bor = {'num_leaves': 20, 'n_estimators': 100, 'boosting_type': 'rf',
                  'bagging_fraction': .8, 'bagging_freq': 1}
    rf_bor = lgb.LGBMClassifier(**params_bor)

    for perc_ in perc_list:
        print('Starting on {}'.format(perc_))
        feat_selector = BorutaPy(rf_bor, n_estimators=100, verbose=0,
                                 random_state=None, max_iter=10, perc=perc_)
        feat_selector.fit(X_df.values, y)
        if perc_ == perc_list[0]:
            times_good = feat_selector.support_ * 1
            times_kinda_good = feat_selector.support_weak_ * 1
        else:
            times_good += feat_selector.support_ * 1
            times_kinda_good += feat_selector.support_weak_ * 1
        res_df_good[str(perc_)] = feat_selector.support_ * 1
        res_df_med[str(perc_)] = feat_selector.support_weak_ * 1

    times_good_max = times_good.max()
    times_med_max = times_kinda_good.max()  # was times_good.max(), apparently a typo
    keep = (((times_good >= allowed_perc_good * times_good_max) |
             (times_kinda_good >= allowed_perc_med * times_med_max)) &
            (times_good + times_kinda_good > 0))
    use_df['use'] = keep
    return use_df, res_df_good, res_df_med
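# --- Usage sketch for boruta_select (not from the original source). Assumes
# lightgbm is installed and imported as lgb at module level, as the function
# requires; the synthetic dataset is illustrative only.
import pandas as pd
from sklearn.datasets import make_classification

# toy binary problem; feature names keep the result index readable
X, y = make_classification(n_samples=300, n_features=8, n_informative=3,
                           random_state=0)
X_df = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
Y = pd.Series(y)

# vote over three shadow-percentage thresholds
use_df, good_df, med_df = boruta_select(X_df, Y, perc_list=[20, 50, 80])
print(use_df[use_df['use']].index.tolist())  # variables judged worth keeping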
def Feature_sort(Feat_scale, Label, threads=4):
    # rank features using three different feature selection methods
    ranks = {}

    # univariate feature selection
    Selector = SelectKBest(f_classif, k='all')
    Selector.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(Selector.pvalues_)

    # randomized logistic regression; a larger n_resampling gives a more
    # robust result (from roughly the 1900th position onward the ranking
    # becomes questionable)
    # note: RandomizedLogisticRegression was deprecated in scikit-learn 0.19
    # and removed in 0.21
    rlogreg = RandomizedLogisticRegression(n_jobs=1, n_resampling=2000,
                                           selection_threshold=0,
                                           verbose=False, random_state=0)
    rlogreg.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(rlogreg.scores_))

    # Boruta based on a random forest
    rf = RandomForestClassifier(random_state=0, n_jobs=threads,
                                max_features='auto')
    feat_selector = BorutaPy(rf, n_estimators='auto', perc=80, random_state=0)
    feat_selector.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(feat_selector.ranking_)

    return ranks
def set_params(self, **params):
    self.__fitOK = False
    if 'classifier' in params.keys():
        self.classifier = params['classifier']
        del params['classifier']
        if type(self.classifier) != bool:
            raise ValueError('classifier flag must be a boolean')
    if 'estimator' in params.keys():
        self.estimator = params['estimator']
        del params['estimator']
        if self.estimator is None and self.classifier:
            self.estimator = Classifier(modelname='RandomForest')
        elif self.estimator is None:
            self.estimator = Regressor(modelname='RandomForest')
        if not isinstance(self.estimator, Classifier) and self.classifier:
            raise ValueError('Classifier is required for classifier=True')
        elif not isinstance(self.estimator, Regressor) and not self.classifier:
            raise ValueError('Regressor is required for classifier=False')
        self.__selector = BorutaPy(self.estimator.get_estimator())
    for k, v in params.items():
        if k not in ['n_estimators', 'perc', 'alpha', 'two_step',
                     'max_iter', 'random_state', 'verbose']:
            warnings.warn("Invalid parameter for feature selector. "
                          "Parameter IGNORED. Check the list of available "
                          "parameters with "
                          "`feature_selector.get_params().keys()`")
        else:
            setattr(self.__selector, k, v)
def _boruta(self):
    self._info(f"Feature importance {self.tag}: Boruta algorithm")
    model_factory = ModelFactoryRandomForest(self.config, self.datasets,
                                             self.model_type)
    model = model_factory.get()
    boruta = BorutaPy(model, n_estimators='auto', verbose=2)
    boruta.fit(self.x_train, self.y_train)
    return boruta
def boruta_fs(X, y, feat_names):
    rfc = RandomForestClassifier(n_estimators=10000, n_jobs=4, max_depth=1)
    boruta = BorutaPy(rfc, n_estimators='auto', verbose=2, max_iter=50)
    boruta.fit(X, y)
    # sort feature names by Boruta rank (rank 1 = confirmed, best first)
    results = sorted(zip(boruta.ranking_, feat_names))
    return [x[1] for x in results]
def find_subsystems_of_interest(studyName, groupsList, geneCounts, level, percentage):
    """
    Summary: uses the Boruta machine learning method to roughly determine
    potential genes of interest. Requires a tab-separated matrix from the
    MG-RAST analysis page.

    Args:
        studyName (str): directory (study name)
        groupsList (list): list of group names
        geneCounts (DataFrame): gene count matrix
        level (str): subsystems level at which to run Boruta
        percentage (int): threshold for Boruta feature selection

    Returns:
        None, outputs files with tentative genes/gene families of interest
    """
    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()
    os.chdir(studyName)

    for i in range(len(numGeneCounts.columns)):
        subsampleList = []
        if int(numGeneCounts[numGeneCounts.columns[i]].sum()) < samplingDepth:
            meanSubsample = numGeneCounts[numGeneCounts.columns[i]]
        else:
            for j in range(100):
                sample = subsample_counts(
                    numGeneCounts[numGeneCounts.columns[i]].transpose().values,
                    int(samplingDepth))
                subsampleList.insert(j, sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
        # recodification: set all values less than 1.01 to zero
        meanSubsample[meanSubsample < 1.01] = 0
        meanSubsample = 100 * meanSubsample / meanSubsample.sum()
        numGeneCounts[numGeneCounts.columns[i]] = meanSubsample

    numGeneCounts['level1'] = geneCounts['level1']
    numGeneCounts['level2'] = geneCounts['level2']
    numGeneCounts['level3'] = geneCounts['level3']
    numGeneCounts['function'] = geneCounts['function']
    countsLvl = numGeneCounts.groupby(level).sum()

    groupsDict = dict(enumerate(pd.Series(groupsList).unique()))
    dictGroups = {y: x for x, y in groupsDict.items()}

    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=3)
    X = countsLvl.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(percentage))
    feat_selector.fit(X, Y)

    if len(countsLvl[feat_selector.support_]) > 0:
        countsLvl[feat_selector.support_].to_csv(str(level) + '_tentative.csv')
        countsLvl[feat_selector.support_weak_].to_csv(
            str(level) + '_tentative_weak.csv')
    os.chdir('..')
def do_boruta(model, X, y, max_iter=500, random_state=42):
    selector = BorutaPy(clone(model), n_estimators='auto', verbose=0,
                        random_state=random_state, max_iter=max_iter)
    selector.fit(X.values, y.values)
    print('do_feat_boruta: Done')
    return X.columns.values[selector.support_]
def get_boruta_features(est, X, y, mode):
    # note: the `est` argument is unused; a fresh random forest is built per mode
    if mode == 'regression':
        rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
    elif mode == 'classification':
        rf = RandomForestClassifier(n_estimators=500, random_state=SEED)
    boruta = BorutaPy(rf, n_estimators='auto')
    boruta.fit(X, y)
    X_features = X[:, boruta.support_]
    return X_features
def cal_boruta(df, target, n=50):
    y = df[target]
    X = df.drop([target], axis=1).values
    y = y.values.ravel()
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                max_depth=10)
    feat_selector = BorutaPy(rf, n_estimators='auto', max_iter=n, verbose=2,
                             random_state=1)
    feat_selector.fit(X, y)
    feature_df = pd.DataFrame(df.drop([target], axis=1).columns.tolist(),
                              columns=['features'])
    feature_df['rank'] = feat_selector.ranking_
    feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
    return feature_df
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame, name: str,
                        cicli: int) -> BorutaPy:
    boruta_selector = BorutaPy(
        RandomForestClassifier(n_jobs=cpu_count(), class_weight='balanced',
                               max_depth=5),
        n_estimators='auto',
        verbose=2,
        alpha=0.05,
        max_iter=cicli,
        random_state=42)
    boruta_selector.fit(X.values, y.values.ravel())
    return boruta_selector
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame) -> BorutaPy:
    boruta_selector = BorutaPy(
        RandomForestClassifier(n_jobs=cpu_count(), class_weight='balanced',
                               max_depth=5),
        n_estimators='auto',
        verbose=2,
        alpha=0.05,  # p-value threshold
        max_iter=10,  # in practice one would run at least 100-200 iterations
        random_state=42)
    boruta_selector.fit(X.values, y.values.ravel())
    return boruta_selector
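# --- Usage sketch for get_features_filter (not from the original source).
# Assumes scikit-learn's load_breast_cancer toy set; dataset choice is
# illustrative only.
import pandas as pd
from sklearn.datasets import load_breast_cancer

# load a small classification dataset as DataFrames
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target)

selector = get_features_filter(X, y)
# keep the confirmed columns; tentative ones are in selector.support_weak_
X_confirmed = X.loc[:, selector.support_]
print(X_confirmed.columns.tolist())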
def __init__(self, number_parent_features, output_dimensions):
    Transformation.__init__(self, 'Boruta',
                            number_parent_features,
                            output_dimensions=output_dimensions,
                            parent_feature_order_matters=False,
                            parent_feature_repetition_is_allowed=False)
    # classifier = LogisticRegression(penalty='l2', solver='lbfgs',
    #                                 class_weight='balanced', max_iter=10000)
    classifier = RandomForestClassifier(class_weight='balanced', max_depth=5)
    self.model = BorutaPy(classifier, n_estimators='auto', random_state=1)
def __init__(self, classifier=True, estimator=None, **kwargs):
    """Selects important features using Boruta.

    Parameters
    ----------
    classifier : bool, default=True
        Flag indicating a classification or regression task.

    estimator : object, default=None
        A Classifier or Regressor with a 'fit' method that provides the
        feature_importances_ attribute. Important features must correspond
        to high absolute values in feature_importances_.
        If None and classifier=True, a RandomForestClassifier is selected;
        if None and classifier=False, a RandomForestRegressor is selected.

    **kwargs : default=None
        Parameters of Boruta.

    Attributes
    ----------
    n_features_ : int
        The number of selected features.

    support_ : array of shape [n_features]
        The mask of selected features - only confirmed ones are True.

    support_weak_ : array of shape [n_features]
        The mask of selected tentative features, which haven't gained enough
        support during the max_iter number of iterations.

    ranking_ : array of shape [n_features]
        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1 and tentative features are
        assigned rank 2.
    """
    if not _IS_BORUTA_INSTALLED:
        raise ValueError('Boruta is required for this module')
    self.classifier = classifier
    if type(self.classifier) != bool:
        raise ValueError('classifier flag must be a boolean')
    self.estimator = estimator
    if self.estimator is None and self.classifier:
        self.estimator = Classifier(modelname='RandomForest')
    elif self.estimator is None:
        self.estimator = Regressor(modelname='RandomForest')
    if not isinstance(self.estimator, Classifier) and self.classifier:
        raise ValueError('Classifier is required for classifier=True')
    elif not isinstance(self.estimator, Regressor) and not self.classifier:
        raise ValueError('Regressor is required for classifier=False')
    self.__selector = BorutaPy(self.estimator.get_estimator(), **kwargs)
    self.__fitOK = False
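# --- A minimal standalone sketch (not from the original source) illustrating
# the BorutaPy attributes documented in the docstring above (n_features_,
# support_, support_weak_, ranking_), using boruta directly on a synthetic
# dataset; the dataset and estimator settings are illustrative assumptions.
from boruta import BorutaPy
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# toy data: 5 informative features out of 10
X, y = make_classification(n_samples=500, n_features=10, n_informative=5,
                           random_state=0)
bp = BorutaPy(RandomForestClassifier(max_depth=5), n_estimators='auto',
              random_state=0)
bp.fit(X, y)
print(bp.n_features_)    # number of confirmed features
print(bp.support_)       # boolean mask of confirmed features
print(bp.support_weak_)  # boolean mask of tentative features
print(bp.ranking_)       # rank 1 = confirmed, 2 = tentative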
def __init__(
    self,
    problem_type,
    random_state=None,
    n_jobs=-1,
    feature_fraction=0.1,
    max_depth=5,
    perc=70,
    n_estimators="auto",
    alpha=0.01,
    max_iter=100,
    **params,
):
    self.problem_type = problem_type
    self.random_state = check_random_state(random_state)
    self.n_jobs = n_jobs
    self.fset_ = None
    self.feature_fraction = feature_fraction
    self.max_depth = max_depth
    self.perc = perc
    self.n_estimators = n_estimators
    self.alpha = alpha
    self.max_iter = max_iter

    num_leaves = 2 ** max_depth
    RfModel = get_RF_class(self.problem_type)
    rfmodel = RfModel(
        random_state=self.random_state.randint(1e6),
        n_jobs=self.n_jobs,
        boosting_type="rf",
        max_depth=max_depth,
        num_leaves=num_leaves,
        feature_fraction=feature_fraction,
        bagging_fraction=0.632,
        bagging_freq=1,
        subsample=None,
        subsample_freq=None,
        verbose=-1,
        colsample_bytree=None,
        importance_type="gain",
        **params,
    )
    # note: **params is forwarded to both the model and BorutaPy
    self.estimator = BorutaPy(
        rfmodel,
        verbose=0,
        random_state=self.random_state,
        perc=perc,
        n_estimators=n_estimators,
        alpha=alpha,
        max_iter=max_iter,
        **params,
    )
def boruta_fit_transform(estimator, X, y):
    feature_selector = BorutaPy(estimator)
    # fit_transform returns X reduced to the confirmed features
    X = feature_selector.fit_transform(X, y)
    return X, feature_selector
def test_get_tree_num(self):
    rfc = RandomForestClassifier(max_depth=10)
    bt = BorutaPy(rfc)
    self.assertEqual(bt._get_tree_num(10), 44, "Tree Est. Math Fail")
    self.assertEqual(bt._get_tree_num(100), 141, "Tree Est. Math Fail")
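# --- For context, a standalone sketch of the tree-count heuristic this test
# exercises (not from the original source). The real logic lives in boruta's
# private _get_tree_num; this reconstruction matches the expected values
# (44 for 10 features, 141 for 100 features) under the assumption depth=10.
import numpy as np

def approx_tree_num(n_feat, depth=10):
    # Boruta trains on X extended with n_feat shadow features, hence the
    # factor of 2; aim for each feature to be considered ~100 times on average
    f_repr = 100
    multi = (n_feat * 2) / (np.sqrt(n_feat * 2) * depth)
    return int(multi * f_repr)

assert approx_tree_num(10) == 44
assert approx_tree_num(100) == 141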