Beispiel #1
0
def select_features(X: pd.DataFrame,
                    y: pd.Series,
                    mode: str,
                    n_estimators: int = 50,
                    max_iter: int = 100,
                    perc: int = 75,
                    learning_rate: float = 0.01,
                    verbosity: int = -1,
                    seed: int = 1,
                    max_depth: int = -1,
                    random_state: int = 1,
                    verbose: int = 2) -> List[str]:
    """Return the names of the columns of ``X`` that Boruta confirms.

    A ``LGBMFeatureEstimator`` is configured for regression when ``mode`` is
    ``"regression"`` and for binary classification otherwise, and is then
    handed to ``BorutaPy``.

    Args:
        X: Feature frame; its column labels are what gets returned.
        y: Target values, flattened to 1-D before fitting.
        mode: ``"regression"`` or anything else (treated as binary).
        n_estimators: Passed to both the estimator wrapper and ``BorutaPy``.
        max_iter: Maximum number of Boruta iterations.
        perc: Percentile forwarded to ``BorutaPy``.
        learning_rate, verbosity, seed, max_depth: LightGBM parameters.
        random_state: Random state for ``BorutaPy``.
        verbose: Verbosity for ``BorutaPy``.

    Returns:
        List of column labels whose entry in ``support_`` is true.

    Raises:
        TypeError: If fitting fails with a ``TypeError`` before Boruta
            produced a ``support_`` mask.
    """
    is_regression = mode == "regression"
    feat_estimator = LGBMFeatureEstimator(
        {
            "objective": "regression" if is_regression else "binary",
            "metric": "rmse" if is_regression else "auc",
            "learning_rate": learning_rate,
            "verbosity": verbosity,
            "seed": seed,
            "max_depth": max_depth,
        }, n_estimators)

    feat_selector = BorutaPy(feat_estimator,
                             n_estimators=n_estimators,
                             max_iter=max_iter,
                             verbose=verbose,
                             random_state=random_state,
                             perc=perc)

    try:
        feat_selector.fit(X.values, y.values.ravel())
    except TypeError:
        # Best-effort: tolerate a TypeError only when a selection mask is
        # already available. Otherwise surface the real failure instead of
        # the misleading AttributeError the return statement would raise.
        if not hasattr(feat_selector, "support_"):
            raise

    return X.columns[feat_selector.support_].tolist()
Beispiel #2
0
    def test_if_boruta_extracts_relevant_features(self):
        """Boruta must confirm exactly the five informative columns."""
        n_samples = 1000
        np.random.seed(42)
        y = np.random.binomial(1, 0.5, n_samples)
        X = np.zeros((n_samples, 10))

        # Noisy copy of the target: subtract and add independent random
        # 0/1 draws, then clip the result back into {0, 1}.
        flipped_down = np.random.binomial(1, 0.1, n_samples)
        flipped_up = np.random.binomial(1, 0.1, n_samples)
        z = y - flipped_down + flipped_up
        z[z == -1] = 0
        z[z == 2] = 1

        # Columns 0-4 are derived from y (relevant features).
        X[:, 0] = z
        scale = np.abs(np.random.normal(0, 1, n_samples))
        jitter = np.random.normal(0, 0.1, n_samples)
        X[:, 1] = y * scale + jitter
        X[:, 2] = y + np.random.normal(0, 1, n_samples)
        X[:, 3] = y**2 + np.random.normal(0, 1, n_samples)
        X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, n_samples)

        # Columns 5-9 are pure noise (irrelevant features).
        X[:, 5] = np.random.normal(0, 1, n_samples)
        X[:, 6] = np.random.poisson(1, n_samples)
        X[:, 7] = np.random.binomial(1, 0.3, n_samples)
        X[:, 8] = np.random.normal(0, 1, n_samples)
        X[:, 9] = np.random.poisson(1, n_samples)

        selector = BorutaPy(RandomForestClassifier())
        selector.fit(X, y)

        # Only the relevant columns, and all of them, must be selected.
        selected = list(np.where(selector.support_)[0])
        self.assertListEqual(list(range(5)), selected)
Beispiel #3
0
    def test_if_boruta_extracts_relevant_features(self):
        """Check that Boruta selects exactly the five informative columns.

        Columns 0-4 of the synthetic matrix are derived from the target,
        columns 5-9 are independent noise.
        """
        np.random.seed(42)
        y = np.random.binomial(1, 0.5, 1000)
        X = np.zeros((1000, 10))

        # Noisy copy of y, clipped back into {0, 1}.
        z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000)
        z[z == -1] = 0
        z[z == 2] = 1

        # 5 relevant features
        X[:, 0] = z
        X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000)
        X[:, 2] = y + np.random.normal(0, 1, 1000)
        X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000)
        X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

        # 5 irrelevant features
        X[:, 5] = np.random.normal(0, 1, 1000)
        X[:, 6] = np.random.poisson(1, 1000)
        X[:, 7] = np.random.binomial(1, 0.3, 1000)
        X[:, 8] = np.random.normal(0, 1, 1000)
        X[:, 9] = np.random.poisson(1, 1000)

        rfc = RandomForestClassifier()
        bt = BorutaPy(rfc)
        bt.fit(X, y)

        # make sure that only all the relevant features are returned.
        # assertItemsEqual exists only in Python 2's unittest; np.where
        # yields ascending indices, so a plain list comparison is equivalent
        # here and works on both Python 2 and 3.
        self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0]))
    def perform_boruta_fs(self):
        """Reduce the pipeline's train/test frames to Boruta-selected columns.

        Does nothing unless ``ml_pipeline.config.fs_boruta_flg`` is truthy.
        Otherwise a BorutaPy selector backed by a random forest is fitted on
        the training data, and ``ml_pipeline.x_train`` / ``x_test`` are
        replaced by new DataFrames that hold only the selected columns.
        """
        if not self.ml_pipeline.config.fs_boruta_flg:
            return

        x_tr = self.ml_pipeline.x_train
        x_te = self.ml_pipeline.x_test
        y_tr = self.ml_pipeline.y_train

        self.jlogger.info(
            "Inside BorutaFS, Before Shape Train: {}".format(x_tr.shape))
        self.jlogger.info(
            "Inside BorutaFS, Before Shape Test: {}".format(x_te.shape))

        forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        selector = BorutaPy(forest, n_estimators='auto', random_state=50)
        selector.fit(x_tr.values, y_tr)

        # transform() works on raw arrays, so the column labels are
        # re-attached from the support mask afterwards.
        kept_columns = x_tr.columns[selector.support_]
        reduced_train = pd.DataFrame(selector.transform(x_tr.values),
                                     columns=kept_columns)
        reduced_test = pd.DataFrame(selector.transform(x_te.values),
                                    columns=kept_columns)

        self.ml_pipeline.x_train = reduced_train
        self.ml_pipeline.x_test = reduced_test

        self.jlogger.info(
            "Inside BorutaFS, After Shape Train: {}".format(
                reduced_train.shape))
        self.jlogger.info(
            "Inside BorutaFS,  After Shape Test: {}".format(
                reduced_test.shape))
Beispiel #5
0
    def run(self):
        """Rank features with Boruta and write the selected data to CSV.

        Reads the three CSV outputs of ``data_transformation``, fits BorutaPy
        on the feature matrix, keeps the features ranked 1 or 2 and writes
        the reduced feature frame and the ``Churn`` column to this task's
        own outputs.
        """
        print("Here : ")
        df_dummies = pd.read_csv(
            data_transformation().output()['output1'].path)
        features_frame = pd.read_csv(
            data_transformation().output()['output2'].path)
        labels_frame = pd.read_csv(
            data_transformation().output()['output3'].path)

        X_boruta = features_frame.values
        # NOTE(review): a literal 'NO' label is inserted at position 7031 of
        # the flattened labels; the reason is not visible here - presumably
        # it pads a row lost when the labels were written. Confirm.
        y_boruta = np.insert(labels_frame.values, 7031, 'NO')

        # Random forest using all cores, wrapped by the Boruta selector.
        forest = RandomForestClassifier(n_jobs=-1)
        feature_selector = BorutaPy(forest, n_estimators='auto',
                                    random_state=1)
        feature_selector.fit(X_boruta, y_boruta)

        # One row per feature (transposed frame), annotated with its rank.
        rank_table = df_dummies.drop(['Churn'], axis=1).T
        rank_table['Boruta_Rank'] = feature_selector.ranking_
        rank_table['Feature'] = rank_table.index
        rank_table = rank_table.sort_values('Boruta_Rank')

        # Keep only the features ranked 1 or 2.
        top_ranked = rank_table.loc[rank_table['Boruta_Rank'].isin([1, 2])]
        selected_features = top_ranked.index

        X_selected = df_dummies[selected_features]
        y_selected = df_dummies["Churn"]
        print(self.output())
        X_selected.to_csv(self.output()['output1'].path, index=False)
        y_selected.to_csv(self.output()['output2'].path, index=False)
Beispiel #6
0
def boruta_selector(df, y=None):
    """Return the feature names that Boruta did NOT confirm.

    Numeric columns are median-imputed; object columns are imputed with the
    most frequent value and ordinal-encoded. BorutaPy is then fitted with a
    random forest on the transformed matrix.

    Args:
        df: DataFrame holding the features and the target column.
        y: Label of the target column in ``df``.

    Returns:
        List of column names whose ``support_`` entry is false, i.e. the
        columns to drop. Columns that are neither numeric (int/float) nor
        object are not passed to Boruta and never appear in the result.
    """
    target = df[y]
    features = df.drop(y, axis=1)
    num_feat = features.select_dtypes(include=['int', 'float']).columns.tolist()
    cat_feat = features.select_dtypes(include=['object']).columns.tolist()

    pipe_num_tree = Pipeline(steps=[('imputer',
                                     SimpleImputer(strategy='median'))])
    pipe_cat_tree = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='most_frequent')),
               ('cat_transformer', OrdinalEncoder())])
    preprocessor_tree = ColumnTransformer(
        transformers=[('num_preprocessor', pipe_num_tree, num_feat),
                      ('cat_preprocessor', pipe_cat_tree, cat_feat)])
    X = preprocessor_tree.fit_transform(features)

    # The transformer emits the numeric block first and the categorical
    # block second, so the Boruta mask must be read against this order and
    # not against the original column order of the frame.
    transformed_names = num_feat + cat_feat

    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=5)
    # Build the Boruta selector.
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             random_state=123,
                             max_iter=100)  # upper bound on iterations
    feat_selector.fit(X, target)

    # Invert the support mask: everything not confirmed is to be dropped.
    return [
        name
        for name, confirmed in zip(transformed_names, feat_selector.support_)
        if not confirmed
    ]
Beispiel #7
0
def fitBorutaRF():
    """Select features with Boruta on the module-level train/test frames.

    Reads the globals ``X_train``, ``y_train`` and ``X_test``.
    Boruta documentation: https://pypi.python.org/pypi/Boruta/0.1.5

    Returns:
        Tuple ``(X_important_train, X_important_test)`` holding only the
        columns confirmed by Boruta.
    """
    print('Feature selection from Boruta RandomForestClassifier: ')

    forest = RandomForestClassifier(n_jobs=-1,
                                    random_state=0,
                                    max_depth=5,
                                    class_weight='balanced')
    selector = BorutaPy(forest,
                        n_estimators='auto',
                        verbose=0,
                        random_state=0)

    # Fit on plain arrays, then use the support mask to slice the frames.
    selector.fit(X_train.values, y_train.values)
    confirmed = selector.support_

    X_important_train = X_train.iloc[:, confirmed]
    X_important_test = X_test.iloc[:, confirmed]

    print("Boruta selected features for the model: ",
          list(X_important_train.columns))

    return X_important_train, X_important_test
def feature_engineering(X_all, y_all, df_dummies):
    """Keep the features Boruta ranks 1 or 2 and pickle their index.

    Args:
        X_all: Feature frame used to fit Boruta.
        y_all: Target values used to fit Boruta.
        df_dummies: Frame containing the features plus a ``Churn`` column.

    Returns:
        Tuple ``(X_selected, y_selected, upload_featuredIndexFilePath)``
        where the last item is whatever ``pickle_df_index`` returns.
    """
    # Random forest using all cores, wrapped by the Boruta selector.
    forest = RandomForestClassifier(n_jobs=-1)
    feature_selector = BorutaPy(forest, n_estimators='auto', random_state=1)
    feature_selector.fit(X_all.values, y_all.values)

    # One row per feature (transposed frame), annotated with its rank.
    rank_table = df_dummies.drop(['Churn'], axis=1).T
    rank_table['Boruta_Rank'] = feature_selector.ranking_
    rank_table['Feature'] = rank_table.index
    rank_table = rank_table.sort_values('Boruta_Rank')

    # Keep only the features ranked 1 or 2.
    top_ranked = rank_table.loc[rank_table['Boruta_Rank'].isin([1, 2])]
    selected_features = top_ranked.index

    X_selected = df_dummies[selected_features]
    y_selected = df_dummies["Churn"]

    # Pickle the selected features for form uploads.
    upload_featuredIndexFilePath = pickle_df_index(X_selected,
                                                   'featured_index_dict.pkl')
    return X_selected, y_selected, upload_featuredIndexFilePath
Beispiel #9
0
def select_features_by_boruta(X_train, X_test, y_train):
    """Fit Boruta with a random-forest regressor and slice both frames.

    Returns:
        Tuple ``(feature_selected_cols, X_train_selected, X_test_selected)``:
        the confirmed column names and the two frames reduced to them.
    """
    regressor = RandomForestRegressor(n_estimators=50,
                                      max_depth=5,
                                      max_features='sqrt',
                                      n_jobs=-1,
                                      verbose=True,
                                      random_state=1)
    selector = BorutaPy(regressor,
                        n_estimators='auto',
                        perc=80,
                        verbose=2,
                        two_step=False,
                        max_iter=100,
                        random_state=1)
    selector.fit(X_train.values, y_train.values)

    # The same boolean mask is applied to the train and test columns.
    mask = selector.support_
    X_train_selected = X_train.iloc[:, mask]
    X_test_selected = X_test.iloc[:, mask]

    feature_selected_cols = list(X_train_selected.columns)
    print('Selected features are: ', feature_selected_cols)

    return feature_selected_cols, X_train_selected, X_test_selected
Beispiel #10
0
def get_boruta(X, y):
    """
    Fit Boruta on the passed dataset, print the outcome and return the selector.

    :param X: feature table; must expose ``.columns`` (e.g. a pandas
        DataFrame) because the column labels are used for reporting
    :param y: target values (anything ``np.array`` accepts)
    :return: the fitted ``BorutaPy`` instance
    """
    from boruta import BorutaPy
    from sklearn.ensemble import RandomForestRegressor
    import numpy as np
    # Initialize Boruta
    forest = RandomForestRegressor(n_jobs=-1, max_depth=5)
    boruta = BorutaPy(
        estimator=forest,
        n_estimators='auto',
        max_iter=100  # number of trials to perform
    )
    # fit Boruta on plain arrays
    boruta.fit(np.array(X), np.array(y))
    # print results: confirmed ("green") and tentative ("blue") features
    green_area = X.columns[boruta.support_].to_list()
    blue_area = X.columns[boruta.support_weak_].to_list()
    print('features in the green area:', green_area)
    print('features in the blue area:', blue_area)
    # The fitted ranking lives in ``ranking_``; BorutaPy has no ``_rankings``
    # attribute, so the old access raised AttributeError after the fit.
    print('features ranking :', boruta.ranking_)
    return boruta
Beispiel #11
0
def by_boruta(data):
    """Return all feature names of ``data`` ordered by Boruta ranking.

    The ``type`` column is the target; every other column is a feature.
    Both are cast to ``int`` before fitting.

    NOTE(review): the ordering is by *descending* ranking value, so features
    with the numerically largest rank come first - confirm this is what
    callers of ``top_features`` expect.
    """
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from boruta import BorutaPy

    target = data.loc[:, 'type'].values.astype(int)
    feature_frame = data.drop(columns=['type'])
    features = feature_frame.columns.to_list()
    matrix = feature_frame.values.astype(int)

    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
    feat_selector = BorutaPy(forest, n_estimators='auto', verbose=2)
    feat_selector.fit(matrix, target)

    ranked = pd.DataFrame(data={
        'features': features,
        'ranking': feat_selector.ranking_
    })
    ranked = ranked.sort_values("ranking", ascending=False)

    top_features = ranked.features.to_list()
    return top_features
def Boruta_fs(x_train, y_train):
    """Rank features with Boruta.

    Arguments:
    x_train, y_train

    Returns:
    (df_rank, df_rank_confirmed): a frame with one ``Rank``/``Features`` row
    per column, and a single-column frame (index starting at 1) listing the
    names of the features whose rank is 1.
    """
    forest = RandomForestClassifier(n_jobs=-1,
                                    random_state=0,
                                    class_weight='balanced')
    # Remaining BorutaPy options (perc, max_iter, two_step) are left at
    # their defaults.
    selector = BorutaPy(forest,
                        n_estimators='auto',
                        verbose=2,
                        random_state=0)
    selector.fit(x_train.values, y_train.values)

    feature_names = x_train.columns.values
    df_rank = pd.DataFrame({
        'Rank': selector.ranking_,
        'Features': feature_names
    })

    # Rank 1 marks the confirmed features.
    is_confirmed = selector.ranking_ == 1
    df_rank_confirmed = pd.DataFrame(feature_names[is_confirmed])
    df_rank_confirmed.index += 1

    return df_rank, df_rank_confirmed
Beispiel #13
0
def Feature_sort(Feat_scale, Label, threads=4):
    """Order the features with three different selection methods.

    Returns:
        Dict mapping ``"Univariate_f"``, ``"Randomized_Logistic_f"`` and
        ``"Boruta_f"`` to an array of feature indices in ranked order.
    """
    ranks = {}

    # 1) Univariate F-test: order by ascending p-value.
    univariate = SelectKBest(f_classif, k='all')
    univariate.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(univariate.pvalues_)

    # 2) Randomized logistic regression; a larger n_resampling gives a more
    #    robust result. From roughly position 1900 onward the resulting
    #    feature order is rather questionable.
    #    NOTE: RandomizedLogisticRegression was deprecated in scikit-learn
    #    0.19 and announced for removal in 0.21.
    randomized_lr = RandomizedLogisticRegression(n_jobs=1,
                                                 n_resampling=2000,
                                                 selection_threshold=0,
                                                 verbose=False,
                                                 random_state=0)
    randomized_lr.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(randomized_lr.scores_))

    # 3) Boruta on top of a random forest, using `threads` workers.
    forest = RandomForestClassifier(random_state=0,
                                    n_jobs=threads,
                                    max_features='auto')
    boruta = BorutaPy(forest, n_estimators='auto', perc=80, random_state=0)
    boruta.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(boruta.ranking_)

    return ranks
Beispiel #14
0
class DataTransformerBoruta:
    """Boruta feature selection followed by correlation-based removal."""

    def __init__(self, corr_th, n_est=500, seed=123):
        self.boruta = True
        forest = RandomForestClassifier(n_estimators=n_est,
                                        class_weight="balanced",
                                        n_jobs=6)
        self.feature_selector = BorutaPy(forest,
                                         n_estimators="auto",
                                         verbose=0,
                                         random_state=seed,
                                         max_iter=100)
        self.corr_rem = CorrelationRemover(corr_th)

    def _boruta_columns(self, X):
        """Return the column labels of ``X`` flagged by the fitted selector."""
        return X.columns[self.feature_selector.support_]

    def fit_transform(self, X, y):
        """Fit Boruta and the correlation remover; return the reduced frame."""
        features = np.array(X)
        target = np.array(y).reshape(-1)
        self.feature_selector.fit(features, target)
        reduced = X[self._boruta_columns(X)]
        return self.corr_rem.fit_transform(reduced)

    def transform(self, X):
        """Apply the already fitted selection steps to ``X``."""
        reduced = X[self._boruta_columns(X)]
        return self.corr_rem.transform(reduced)

    def get_selected_num(self):
        """Number of Boruta-selected features minus the removed ones."""
        removed = self.corr_rem.get_removed_num()
        return self.feature_selector.n_features_ - removed

    def get_selected_vec(self, X):
        """Return the surviving column labels, shifted by one.

        The labels are incremented because columns are counted from 1.
        """
        kept = set(self._boruta_columns(X)) - set(self.corr_rem.get_removed_vec())
        return np.array(list(kept)) + 1
Beispiel #15
0
def main():
    """Run Boruta on the training CSV and write the filtered data set.

    Reads ``./data/my_midterm_train.csv`` (target column ``y``), fits
    BorutaPy with a random forest, and writes the support mask, the feature
    ranking and the filtered training data to CSV files.
    """
    print("Begin Feature Selection Step...")
    print('-' * 60)
    print('Loading Data...')
    df = pd.read_csv("./data/my_midterm_train.csv")
    y = df['y']
    X = df.drop(['y'], axis=1)

    # define random forest classifier, with utilising all cores and
    # weighting classes in proportion to y labels. 'balanced' replaces the
    # 'auto' spelling that scikit-learn no longer accepts.
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=5)

    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)
    print("Fitting Boruta...")
    # find all relevant features
    feat_selector.fit(X.values, y)

    print("Selected Features:")
    # check selected features
    print(feat_selector.support_)
    support = pd.DataFrame(feat_selector.support_)

    print("Selected Feature Rank:")
    # check ranking of features
    print(feat_selector.ranking_)
    # Persist the ranking itself; the support mask was written here before,
    # which made feature_ranking.csv a copy of feature_support.csv.
    ranking = pd.DataFrame(feat_selector.ranking_)

    # filter X down to the selected features (.ix was removed from pandas;
    # .loc accepts the same boolean column mask)
    print("Transforming X...")
    X_filtered = X.loc[:, feat_selector.support_]
    print("Writing Data...")
    support.to_csv("./work_dir/feature_support.csv", index=False)
    ranking.to_csv("./work_dir/feature_ranking.csv", index=False)
    combined_df = pd.concat([X_filtered, y], axis=1)
    combined_df.to_csv("./data/boruta_filtered_stacked_train.csv", index=False)
Beispiel #16
0
def select_features(X: pd.DataFrame,
                    y: pd.Series,
                    mode: str,
                    n_estimators: int = 50,
                    max_iter: int = 50,
                    perc: int = 75) -> List[str]:
    """Return the names of the columns of ``X`` that Boruta confirms.

    A ``LGBMFeatureEstimator`` is configured for regression when ``mode`` is
    ``"regression"`` and for binary classification otherwise, and is then
    handed to ``BorutaPy``.

    Args:
        X: Feature frame; its column labels are what gets returned.
        y: Target values, flattened to 1-D before fitting.
        mode: ``"regression"`` or anything else (treated as binary).
        n_estimators: Passed to both the estimator wrapper and ``BorutaPy``.
        max_iter: Maximum number of Boruta iterations.
        perc: Percentile forwarded to ``BorutaPy``.

    Returns:
        List of column labels whose entry in ``support_`` is true.

    Raises:
        Exception: Whatever ``fit`` raised, if it failed before a
            ``support_`` mask was produced.
    """
    is_regression = mode == "regression"
    feat_estimator = LGBMFeatureEstimator(
        {
            "objective": "regression" if is_regression else "binary",
            "metric": "rmse" if is_regression else "auc",
            "learning_rate": 0.01,
            "verbosity": -1,
            "seed": 1,
            "max_depth": 7,
            "min_data_in_leaf": 3,
        }, n_estimators)

    feat_selector = BorutaPy(feat_estimator,
                             n_estimators=n_estimators,
                             max_iter=max_iter,
                             verbose=2,
                             random_state=1,
                             perc=perc)

    try:
        feat_selector.fit(X.values, y.values.ravel())
    except Exception:
        # Stay best-effort when a selection mask already exists, but do not
        # swallow KeyboardInterrupt/SystemExit, and do not replace the real
        # error with an AttributeError on the missing ``support_``.
        if not hasattr(feat_selector, "support_"):
            raise

    return X.columns[feat_selector.support_].tolist()
def boruta_select(X_df, Y, perc_list=[20], allowed_perc_good=.5, allowed_perc_med=.70, samples=[1], multiclass=False):
    """
    Runs the Boruta selector once per percentile and aggregates the outcome

    :param X_df: The X Dataframe that the selector will run on
    :param Y: The y for the training of the selector
    :param perc_list: The percentages that boruta will be run on (only read,
        never mutated)
    :param allowed_perc_good: How many times does one variable has to beat the random ones
    :param allowed_perc_med: How many times does one variable has to be tentative
    :param samples: nothing at this moment, possible expansion into sampling
    :param multiclass: currently has no effect; the same LightGBM parameters
        are used either way
    :return: first dataframe is if the variable should be used, second is what variables were relevant at each percentage
    , third is what variables were tentative in each percentage
    """
    y = Y.values.ravel()

    res_df_good = pd.DataFrame(index=X_df.columns)
    res_df_med = pd.DataFrame(index=X_df.columns)
    use_df = pd.DataFrame(index=X_df.columns)

    params_bor = {'num_leaves': 20, 'n_estimators': 100, 'boosting_type': 'rf',
                  'bagging_fraction': .8, 'bagging_freq': 1}
    rf_bor = lgb.LGBMClassifier(**params_bor)

    # Start the per-feature counters at zero. The old "is this the first
    # percentile?" test reset them whenever a later entry of perc_list
    # equalled the first one, and left them undefined for an empty list.
    times_good = pd.Series(0, index=X_df.columns).values
    times_kinda_good = pd.Series(0, index=X_df.columns).values

    for perc_ in perc_list:
        print('Starting on {}'.format(perc_))
        feat_selector = BorutaPy(rf_bor, n_estimators=100, verbose=0, random_state=None, max_iter=10,
                                 perc=perc_)
        feat_selector.fit(X_df.values, y)

        confirmed = (feat_selector.support_) * 1
        tentative = (feat_selector.support_weak_) * 1
        times_good = times_good + confirmed
        times_kinda_good = times_kinda_good + tentative

        res_df_good[str(perc_)] = confirmed
        res_df_med[str(perc_)] = tentative

    times_good_max = times_good.max()
    # NOTE(review): the "med" threshold is scaled by the maximum of
    # times_good, not of times_kinda_good. Kept as is because changing it
    # alters which features are kept - confirm the intended reference.
    times_med_max = times_good.max()
    keep = (((times_good >= allowed_perc_good * times_good_max) |
             (times_kinda_good >= allowed_perc_med * times_med_max)) & (
        times_good + times_kinda_good > 0))

    use_df['use'] = keep

    return (use_df, res_df_good, res_df_med)
Beispiel #18
0
 def _boruta(self):
     """Fit BorutaPy on the training data and return the fitted selector.

     The wrapped estimator comes from ``ModelFactoryRandomForest``.
     """
     self._info(f"Feature importance {self.tag}: Boruta algorithm")
     forest = ModelFactoryRandomForest(self.config, self.datasets,
                                       self.model_type).get()
     selector = BorutaPy(forest, n_estimators='auto', verbose=2)
     selector.fit(self.x_train, self.y_train)
     return selector
Beispiel #19
0
def do_boruta(model, X, y, max_iter=500, random_state=42):
    """Return the column labels of ``X`` confirmed by Boruta.

    ``model`` is cloned before use, so the caller's estimator stays unfitted.
    """
    boruta = BorutaPy(
        clone(model),
        n_estimators='auto',
        verbose=0,
        random_state=random_state,
        max_iter=max_iter,
    )
    boruta.fit(X.values, y.values)
    print('do_feat_boruta: Done')
    column_labels = X.columns.values
    return column_labels[boruta.support_]
Beispiel #20
0
def find_subsystems_of_interest(studyName, groupsList, geneCounts, level,
                                percentage):
    """
    Summary: uses Boruta machine learning method to roughly determine potential genes of interest. requires tab-separated  matrix from MG-RAST analysis page

    Args:
        studyName (str): directory (study name); output files are written there
        groupsList (list): list of group names (currently unused)
        geneCounts (DataFrame): count matrix with one numeric column per
            sample plus the ``level1``, ``level2``, ``level3`` and
            ``function`` annotation columns. The class label of a sample is
            the part of its column name before the first ``_``.
        level (str): subsystems level at which to run Boruta
        percentage (int): threshold for Boruta feature selection


    Returns: None, outputs files with tentative genes/gene families of interest

    """

    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()

    # Remember where we started so the working directory is restored even
    # when something below fails, and also when studyName is a nested path
    # (a plain chdir('..') would end up in the wrong place).
    startDir = os.getcwd()
    os.chdir(studyName)
    try:
        for i, column in enumerate(numGeneCounts.columns):
            if int(numGeneCounts[column].sum()) < samplingDepth:
                # Samples shallower than the median depth are used as is.
                meanSubsample = numGeneCounts[column]
            else:
                # Average 100 random subsamples drawn at the median depth.
                subsampleList = [
                    subsample_counts(numGeneCounts[column].transpose().values,
                                     int(samplingDepth))
                    for _ in range(100)
                ]
                print("completed 100 subsamples for sample number " + str(i))
                meanSubsample = pd.Series(subsampleList).mean()
                #recodification: setting all values less than 1.01 to zero
                meanSubsample[meanSubsample < 1.01] = 0
            # Convert to percentages of the sample total.
            meanSubsample = 100 * meanSubsample / meanSubsample.sum()
            numGeneCounts[column] = meanSubsample

        numGeneCounts['level1'] = geneCounts['level1']
        numGeneCounts['level2'] = geneCounts['level2']
        numGeneCounts['level3'] = geneCounts['level3']
        numGeneCounts['function'] = geneCounts['function']
        countsLvl = numGeneCounts.groupby(level).sum()

        rf = RandomForestClassifier(n_jobs=-1,
                                    class_weight='balanced',
                                    max_depth=3)
        X = countsLvl.transpose().values
        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=2,
                                 perc=int(percentage))
        feat_selector.fit(X, Y)

        # The confirmed table is only written when it is non-empty; the
        # weak (tentative) table is always written.
        if len(countsLvl[feat_selector.support_]) > 0:
            countsLvl[feat_selector.support_].to_csv(
                str(level) + '_tentative.csv')
        countsLvl[feat_selector.support_weak_].to_csv(
            str(level) + '_tentative_weak.csv')
    finally:
        os.chdir(startDir)
Beispiel #21
0
def boruta_fs(X, y, feat_names):
    """Return ``feat_names`` ordered by ascending Boruta rank.

    Ties in rank are broken by the ordering of the names themselves.
    """
    forest = RandomForestClassifier(n_estimators=10000, n_jobs=4, max_depth=1)
    selector = BorutaPy(forest, n_estimators='auto', verbose=2, max_iter=50)
    selector.fit(X, y)

    ranked_pairs = sorted(zip(selector.ranking_, feat_names))
    return [name for _, name in ranked_pairs]
def get_boruta_features(est, X, y, mode):
    """Return the columns of ``X`` that Boruta confirms.

    Args:
        est: Unused; kept for interface compatibility.
        X: 2-D array of features (indexed as ``X[:, mask]``).
        y: Target values.
        mode: ``'regression'`` or ``'classification'``; selects the random
            forest flavour wrapped by Boruta.

    Returns:
        ``X`` restricted to the columns whose ``support_`` entry is true.

    Raises:
        ValueError: If ``mode`` is neither of the two supported values.
    """
    if mode == 'regression':
        rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
    elif mode == 'classification':
        rf = RandomForestClassifier(n_estimators=500, random_state=SEED)
    else:
        # Previously an unknown mode fell through and crashed later with an
        # unbound-local error on ``rf``.
        raise ValueError(
            "mode must be 'regression' or 'classification', got %r" % (mode,))
    boruta = BorutaPy(rf, n_estimators='auto')
    boruta.fit(X, y)
    X_features = X[:, boruta.support_]
    return X_features
Beispiel #23
0
def select_features(X, y, X_sub, feature_name, perc=10, max_depth=5, verbose=2):
  """Fit Boruta on ``X`` and reduce both ``X`` and ``X_sub`` with it.

  The names of the confirmed features (looked up in ``feature_name`` by
  position) are printed. Returns the two transformed value arrays.
  """
  import sklearn.ensemble
  from boruta import BorutaPy

  regressor = sklearn.ensemble.RandomForestRegressor(max_depth=max_depth)
  selector = BorutaPy(regressor, n_estimators='auto', perc=perc, verbose=verbose)
  selector.fit(X.values, y)

  used_features = [
      feature_name[position]
      for position, is_selected in enumerate(selector.support_)
      if is_selected
  ]
  print(used_features)

  reduced_train = selector.transform(X.values)
  reduced_sub = selector.transform(X_sub.values)
  return reduced_train, reduced_sub
Beispiel #24
0
def cal_boruta(df,target,n=50):
    """Rank every feature of ``df`` with Boruta.

    Args:
        df: DataFrame holding the features and the target column.
        target: Label of the target column.
        n: Maximum number of Boruta iterations.

    Returns:
        DataFrame with the columns ``features`` and ``rank``, sorted by
        ascending rank with a fresh index.
    """
    # Split once; the feature frame supplies both the matrix and the names.
    features = df.drop([target], axis=1)
    X = features.values
    # Go through .values: Series.ravel() is deprecated in recent pandas.
    y = df[target].values.ravel()
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=10)
    feat_selector = BorutaPy(rf, n_estimators='auto', max_iter=n, verbose=2, random_state=1)
    feat_selector.fit(X, y)
    feature_df = pd.DataFrame(features.columns.tolist(), columns=['features'])
    feature_df['rank'] = feat_selector.ranking_
    feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
    return feature_df
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame) -> BorutaPy:
    """Return a BorutaPy selector fitted on ``X`` and the flattened ``y``."""
    forest = RandomForestClassifier(n_jobs=cpu_count(),
                                    class_weight='balanced',
                                    max_depth=5)
    selector = BorutaPy(
        forest,
        n_estimators='auto',
        verbose=2,
        alpha=0.05,  # p-value threshold
        max_iter=10,  # in practice one would run at least 100-200 times
        random_state=42,
    )
    selector.fit(X.values, y.values.ravel())
    return selector
Beispiel #26
0
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame, name: str,
                        cicli: int) -> BorutaPy:
    """Return a BorutaPy selector fitted on ``X`` and the flattened ``y``.

    ``cicli`` is the maximum number of Boruta iterations; ``name`` is not
    used by this function.
    """
    forest = RandomForestClassifier(n_jobs=cpu_count(),
                                    class_weight='balanced',
                                    max_depth=5)
    selector = BorutaPy(
        forest,
        n_estimators='auto',
        verbose=2,
        alpha=0.05,
        max_iter=cicli,
        random_state=42,
    )
    selector.fit(X.values, y.values.ravel())
    return selector
Beispiel #27
0
    def fit(self, df, cfg):
        """
        Performs Boruta feature selection

        Parameters:
            df (dataframe): dataframe.
            cfg (dict): configuration dictionary.

        Returns:
            selected_features: list of selected variable names.
        """
        excluded = cfg['drop_cols'] + [cfg['ID_COL'], cfg['CTU_COL']]
        all_features = [col for col in df.columns if col not in excluded]

        X = df[all_features].values
        y = df[cfg['TE_TARGET_COL']].values.ravel()

        # When the target sums to less than 10% of the row count, a class
        # weighting and a deeper, larger forest are used.
        positives = sum(y)
        if (positives / len(y)) < 0.1:
            class_ratio = (len(y) - positives) / positives
            print ("Class Ratio:", class_ratio)
            class_weight = {1: class_ratio, 0: 1.5}
            max_depth = 8
            n_estimators = 400
        else:
            class_weight = None
            max_depth = 5
            n_estimators = 200

        rf = RandomForestClassifier(
            bootstrap=True,
            class_weight=class_weight,
            criterion='gini',
            max_depth=max_depth,
            max_features='auto',
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            min_samples_leaf=2,
            min_samples_split=10,
            min_weight_fraction_leaf=0.0,
            n_estimators=n_estimators,
            oob_score=False,
            random_state=121,
            verbose=0,
            warm_start=False,
        )

        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=2,
                                 random_state=cfg['seed'],
                                 max_iter=cfg['max_iter'],
                                 perc=cfg['z_score_percentile'],
                                 two_step=cfg['two_step'])
        feat_selector.fit(X, y)

        selected_features = [
            col
            for (col, id_bool) in zip(all_features, feat_selector.support_)
            if id_bool
        ]

        return selected_features
Beispiel #28
0
def boruta(dataset: pd.DataFrame, labels: np.array, max_iter: int, p_value_threshold: float, random_state: int) \
        -> pd.DataFrame:
    """Return ``dataset`` restricted to the columns Boruta confirms."""
    selector = BorutaPy(
        RandomForestClassifier(n_jobs=cpu_count(),
                               class_weight='balanced',
                               max_depth=5),
        n_estimators='auto',
        verbose=2,
        alpha=p_value_threshold,
        max_iter=max_iter,
        random_state=random_state,
    )
    selector.fit(dataset.values, labels)

    # Positions of the confirmed columns, translated back to their labels.
    confirmed_positions = np.where(selector.support_ == True)
    confirmed_labels = dataset.columns[confirmed_positions]
    return dataset[confirmed_labels]
def boruta_algorithm(dataset, target_name):
    '''
    This function selects features in the dataset using an implementation
    of the boruta algorithm.

    The selection is generous: a feature is returned when Boruta either
    confirmed it or left it tentative.

    :param dataset: DataFrame holding the features and the target column
    :param target_name: label of the target column
    :return: list of the selected feature names, in column order
    '''
    print('USING BORUTA')
    features = dataset.drop([target_name], axis=1)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
    feat_selector.fit(features.values, dataset[target_name].values.ravel())
    # support_weak_ alone marks only the tentative features; returning just
    # that mask silently dropped every confirmed feature.
    generously_selected = feat_selector.support_ | feat_selector.support_weak_
    return [name for name, mask in zip(features.columns, generously_selected) if mask]
Beispiel #30
0
    def boruta_tree(self, X_train_smote, y_train_res, X_test, y_test, n_features):
        """
        - Preselect features with Boruta (a single pass, see the loop below).
        - Decrease amount of features to n_features using a Random Forest Classifier.

        Returns a dictionary ``{"Boruta Tree": [names, importances]}`` with
        the top ``n_features`` features and their importances.
        """
        # Boruta runs once only: a second pass sometimes fails with so
        # little data.
        for _ in range(1):

            from sklearn.metrics import f1_score # import again to avoid error...

            # Boruta on top of a random forest.
            boruta_forest = RandomForestClassifier(n_jobs=-1, random_state=self.seed)
            selector = BorutaPy(boruta_forest,
                                n_estimators='auto',
                                verbose=0,
                                alpha=0.005,
                                max_iter=30,
                                perc=100,
                                random_state=self.seed)
            selector.fit(X_train_smote.values, y_train_res)

            # Restrict the training frame to the confirmed columns and score
            # a logistic regression on them.
            cols = X_train_smote.columns[selector.support_]
            X_train_smote = X_train_smote[cols]
            logreg = LogisticRegression(random_state=self.seed)
            logreg.fit(X_train_smote, y_train_res)

            scores = cross_val_score(logreg, X_train_smote, y_train_res, cv=5)
            print("Accuracy of Boruta: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

        # Small random forest used to rank the preselected features.
        X_filtered = X_train_smote[cols]
        ranking_forest = RandomForestClassifier(n_estimators = 10,
                                                criterion = 'gini',
                                                random_state = self.seed)
        ranking_forest.fit(X_filtered, y_train_res)
        predictions = ranking_forest.predict(X_test[cols])
        print("Accuracy of Boruta Tree: {:.3f}".format(accuracy_score(y_test, predictions)))

        # Importance table, reduced to the n_features strongest entries.
        importance_table = pd.DataFrame({
            "feature": X_filtered.columns,
            "coefficient": ranking_forest.feature_importances_,
        })
        top_table = (importance_table
                     .sort_values(by="coefficient", ascending=False)
                     .head(n_features)
                     .set_index("feature"))

        selected_features = top_table.index.tolist()
        feature_importances = top_table.coefficient.tolist()
        return {"Boruta Tree": [selected_features, feature_importances]}
Beispiel #31
0
def getFeaturesRanking(X, y):
    """Return the Boruta ranking of the columns of ``X``.

    Args:
        X: Feature frame (its ``.values`` are fitted).
        y: Target values (its ``.values`` are fitted).

    Returns:
        ``ranking_`` of the fitted selector. If Boruta fails, an all-ones
        integer array with one entry per column of ``X`` is returned.
    """
    rf = RandomForestRegressor(max_depth=5)
    try:
        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=3,
                                 max_iter=20)
        feat_selector.fit(X.values, y.values)
        return feat_selector.ranking_
    except Exception:
        # The fallback used an undefined name (`names`) and the removed
        # `np.int` alias, so it raised instead of returning a ranking.
        # The column count of X gives the intended length.
        return np.ones((X.shape[1], ), dtype=int)