def train_svm_k_fold_Boruta(matrix,
                            target,
                            gamma,
                            linear=True,
                            nfeatures=5,
                            nsplits=10,
                            penalty="l2",
                            C=1,
                            multi_class="ovr",
                            kernel="rbf",
                            degree=3,
                            probability=False,
                            decision_function_shape="ovr"):
    scores = []
    confusion = []
    features = []
    parameters = {
        "Gamma": gamma,
        "Linear": linear,
        "C": C,
        "Kernel": kernel,
        "Degree": degree,
        "Average": [],
        "Scores": [],
        "Features": []
    }
    if linear:
        best_svc = LinearSVC(penalty=penalty, C=C, multi_class=multi_class)
    else:
        best_svc = SVC(C=C,
                       kernel=kernel,
                       gamma=gamma,
                       degree=degree,
                       probability=probability,
                       decision_function_shape=decision_function_shape)
    # shuffle so the fixed random_state actually takes effect
    cv = KFold(n_splits=nsplits, shuffle=True, random_state=42)
    for train_index, test_index in cv.split(matrix):
        #print("Train Index: ", train_index, "\n")
        #print("Test Index: ", test_index)
        X_train, X_test, y_train, y_test = matrix[train_index], matrix[
            test_index], target[train_index], target[test_index]
        # ---------------- FEATURE SELECTION ------------------------
        rf = RandomForestClassifier(n_jobs=-1,
                                    class_weight='balanced',
                                    max_depth=5)
        # define Boruta feature selection method
        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=2,
                                 random_state=1,
                                 max_iter=50,
                                 perc=90)
        # find all relevant features on this training fold
        feat_selector.fit(X_train, y_train)
        # feat_selector.support_ holds the boolean mask of confirmed features (used below)
        # call transform() on X_train to filter it down to the selected features
        X_train_filtered = feat_selector.transform(X_train)

        # column indices of the features Boruta confirmed as relevant
        final_features = list(np.where(feat_selector.support_)[0])
        #print("Relevant features:", final_features)

        # --------------- TRAINING ------------------------------
        # Train the model on (at most) the first nfeatures selected features
        selected = final_features[:nfeatures]
        best_svc.fit(X_train[:, selected], y_train)

        # --------------- TESTING -------------------------------
        # Getting the predictions of the model on the test set
        svc_predictions = best_svc.predict(X_test[:, selected])
        # getting accuracy
        scores.append(best_svc.score(X_test[:, selected], y_test))
        parameters["Features"].append(selected)
        # getting confusion matrix
        confusion.append(confusion_matrix(y_test, svc_predictions))
    parameters["Scores"].append(scores)
    parameters["Average"] = np.average(scores)
    return (scores, confusion, parameters)
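# ------------------------------------------------------------------
# Hedged usage sketch (not part of the original snippet). It assumes the imports the
# function above relies on (numpy as np, KFold, LinearSVC, SVC, RandomForestClassifier,
# confusion_matrix, BorutaPy) are already in scope.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=5, random_state=0)
demo_scores, demo_confusion, demo_params = train_svm_k_fold_Boruta(
    X_demo, y_demo, gamma="scale", linear=True, nfeatures=5, nsplits=5)
print(demo_params["Average"])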
Example #2
y_Pred = modelDecision.predict(X_test)

#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
modelRandom = RandomForestClassifier(n_estimators=100,
                                     n_jobs=4,
                                     class_weight='balanced')
modelRandom = modelRandom.fit(X_train_res, y_train_res)
print("Results For Random Forest")
scoreRandom = modelRandom.score(X_test_res, y_test_res)
print("\nScore", scoreRandom * 100)

#Boruta feature elimination
boruta_selector = BorutaPy(modelRandom,
                           n_estimators='auto',
                           verbose=2,
                           max_iter=40)
boruta_selector.fit(X_train_res, y_train_res)

#XGBoost ...51.3
import xgboost as xgb
modelXgb = xgb.XGBClassifier(booster='gbtree',
                             objective="binary:logistic",
                             random_state=200)
modelXgb = modelXgb.fit(X_train, y_train)

print("Results For XGBoost")
scoreXgb = modelXgb.score(X_test, y_test)
print("\nScore", scoreXgb * 100)

y_Pred = modelXgb.predict(X_test)
Example #3
 def test_get_tree_num(self):
     rfc = RandomForestClassifier(max_depth=10)
     bt = BorutaPy(rfc)
     self.assertEqual(bt._get_tree_num(10), 44, "Tree Est. Math Fail")
     self.assertEqual(bt._get_tree_num(100), 141, "Tree Est. Math Fail")
Example #4
model = xgb.XGBClassifier()  #For Boruta
"""
Create shadow features – random features and shuffle values in columns
Train Random Forest / XGBoost and calculate feature importance via mean decrease impurity
Check if real features have higher importance compared to shadow features 
Repeat this for every iteration
If original feature performed better, then mark it as important 
"""

from boruta import BorutaPy

# define Boruta feature selection method. Experiment with n_estimators and max_iter
feat_selector = BorutaPy(model,
                         n_estimators=200,
                         verbose=2,
                         random_state=42,
                         max_iter=20)

# find all relevant features
feat_selector.fit(X_train_boruta, y_train_boruta)

# check selected features
print(feat_selector.support_)  #Should we accept the feature

# check ranking of features
print(feat_selector.ranking_)  #Rank 1 is the best
"""
Review the features
"""
feature_names = np.array(X_resampled.columns)  #Convert dtype string?
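# ------------------------------------------------------------------
# Hedged continuation (not in the original snippet): pair each name with its Boruta
# decision and rank. Assumes feature_names lines up with the columns of the
# X_train_boruta matrix passed to feat_selector.fit above.
import pandas as pd

feature_review = pd.DataFrame({
    "feature": feature_names,
    "confirmed": feat_selector.support_,
    "rank": feat_selector.ranking_,
}).sort_values("rank")
print(feature_review)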
Example #5
def run_boruta(tax_file,
               otu_tab,
               group_method,
               filter=None,
               fn=None,
               output_result_dir=None,
               subset_otu=None,
               max_depth=5,
               max_iter=1000,
               is_normalized=False):
    """

    :param tax_file:
    :param otu_tab:
    :param group_method: a function to group the sample, receive each sample,return a group label.
    :param filter: receive a tax level abbre. Such as, if you want genus level only without upper or lower,you can type 'g'.
    :param fn: output html absoulute filename.
    :param subset_otu: samples name you want to use only
    :return:
    """

    otu_tab = read_data(otu_tab)
    # check orientation: OTUs are expected on the rows at this point
    if not otu_tab.index.values[0].startswith('OTU'):
        otu_tab = otu_tab.T

    if subset_otu:
        otu_tab = otu_tab.loc[:, subset_otu]
        otu_tab = otu_tab[otu_tab.sum(1) != 0]

    # transpose into samples (rows) by OTUs (cols)
    otu_tab = otu_tab.T
    if not is_normalized:
        # normalization into relative abundance
        otu_tab = otu_tab.div(otu_tab.sum(axis=1), axis=0)

    otu_tab = otu_tab.loc[:, otu_tab.sum(0) != 0]
    otus = otu_tab.columns.values.tolist()

    otu_tax = get_otu_tax(tax_file, otus, filter=filter)
    # propagate with tax profiles
    try:
        tax_tab = get_tax_profile(otu_tab, otu_tax)
    except:
        import pdb
        pdb.set_trace()
    # tax_tab
    # get sample metadata
    samples = list(tax_tab.index)
    groups = [_ for _ in map(group_method, samples)]

    ###########################################################################################
    ### boruta ################################################################################
    ###########################################################################################
    X = tax_tab.values
    print(X.shape)
    y = groups
    features = tax_tab.columns
    max_depth = max_depth
    max_iter = max_iter

    # define random forest classifier, with utilising all cores and
    # sampling in proportion to y labels
    # [tree pruning!!!] highly recommend using pruned trees with a depth between 3-7
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=max_depth,
                                random_state=123)
    # define Boruta feature selection method
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=1,
                             random_state=123,
                             max_iter=max_iter)
    # find all relevant features
    try:
        feat_selector.fit(X, y)
    except:
        import pdb
        pdb.set_trace()
    # check selected features
    #print feat_selector.support_
    # check ranking of features
    #print feat_selector.ranking_
    # call transform() on X to filter it down to selected features
    # X_filtered = feat_selector.transform(X)
    print("confirmed features:")
    print(features[feat_selector.support_])
    print("tentative features:")
    print(features[feat_selector.support_weak_])
    # find out the trend of importances.
    fea_status = []
    if len(set(y)) == 2:
        a, b = tuple(set(y))
        for fea in list(features):
            med_a = np.median(tax_tab.loc[[
                i for i, v in zip(tax_tab.index.values.tolist(), y) if v == a
            ], fea])
            med_b = np.median(tax_tab.loc[[
                i for i, v in zip(tax_tab.index.values.tolist(), y) if v == b
            ], fea])
            if med_a >= med_b:
                fea_status.append('(%s > %s)' % (a, b))
            else:
                fea_status.append('(%s < %s)' % (a, b))
        features = np.array(
            [i + v for i, v in zip(features.values.tolist(), fea_status)])
    # get feature importances
    # forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=max_depth,
    #                                 random_state=123, n_estimators=max_iter)
    # forest.fit(X, y)
    importances = feat_selector._get_imp(X, y)
    std = np.std([
        tree.feature_importances_
        for tree in feat_selector.estimator.estimators_
    ],
                 axis=0)
    # reverse sorted index
    colors = np.empty(len(features), dtype=object)
    colors.fill('blue')
    colors[feat_selector.support_] = 'red'
    colors[feat_selector.support_weak_] = 'yellow'
    indices = np.argsort(importances)
    selected_indices = indices[-40:]
    print(selected_indices)
    trace = go.Bar(
        y=features[selected_indices],
        x=importances[selected_indices],
        marker=dict(color=colors[selected_indices]),
        error_x=dict(visible=True, arrayminus=std[selected_indices]),
        orientation='h',
    )
    layout = go.Layout(title="Feature importances", margin=go.Margin(l=800))
    fig = go.Figure(data=[trace], layout=layout)
    if output_result_dir:
        if not os.path.isdir(output_result_dir):
            os.makedirs(output_result_dir)
        tax_tab.to_csv(output_result_dir + '/input_data.tab', sep='\t')
        tax_tab.loc[:, feat_selector.support_].to_csv(
            output_result_dir + '/confirmed_features_data.tab', sep='\t')
        tax_tab.loc[:, feat_selector.support_weak_].to_csv(
            output_result_dir + '/tentative_features_data.tab', sep='\t')
        tmp = pd.DataFrame(index=features[indices], columns=['importances'])
        tmp.loc[:, 'importances'] = importances[indices]
        tmp.to_csv(output_result_dir + '/features_importances.tab', sep='\t')
    if fn:
        ply.plot(fig, filename=fn)
    else:
        ply.plot(fig)
Example #6
    def __init__(self, model, model_kwargs):

        self.model = model(**model_kwargs)
        self.boruta = BorutaPy(self.model)
Example #7
            cmap=cmap,
            vmax=.3,
            center=0,
            square=True,
            linewidths=.5,
            cbar_kws={"shrink": .5})
plt.show()

##### Boruta for feature selection
X = station_trips.drop(["Trip_count", 'Date'], axis=1)
y = station_trips['Trip_count']

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf,
                         n_estimators='auto',
                         verbose=2,
                         random_state=44,
                         max_iter=25,
                         perc=95)

# find all relevant features
feat_selector.fit(np.array(X), y)

# features selected by the Boruta algo
selected_features = X.columns[feat_selector.support_]
#selected_features = ['Station', 'is_workday', 'Precipitation', 'Temp_max', 'Temp_min']
Example #8
if(number_of_training_samples > 4000):
    limit_train_samples = 4000
else:
    limit_train_samples = number_of_training_samples

train_input = train_input_test[:limit_train_samples,:]
train_output_d = train_output_d_test[:limit_train_samples]

if(number_of_features > 10):
    max_features = 10
else:
    max_features = number_of_features
bo_subset = 99*np.ones((1,max_features))

RF_c = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
xgboost_ensemble = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=200,
                                 silent=True, objective='binary:logistic',
                                 booster='gbtree', n_jobs=1, nthread=None, gamma=0,
                                 min_child_weight=1, max_delta_step=0, subsample=1,
                                 colsample_bytree=1, colsample_bylevel=1,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                 base_score=0.5, random_state=1, seed=2, missing=None)
# XGBoost
boruta = BorutaPy(xgboost_ensemble, n_estimators='auto', verbose=2, random_state=1)
boruta.fit(train_input, np.ravel(train_output_d))
boruta_rank = boruta.ranking_
boruta_count = boruta.n_features_
relevant_indices = boruta_rank.argsort()[:boruta_count]
print(relevant_indices)
# Random Forest
boruta = BorutaPy(RF_c, n_estimators='auto', verbose=2, random_state=1)
boruta.fit(train_input, np.ravel(train_output_d))
boruta_rank = boruta.ranking_
boruta_count = boruta.n_features_
relevant_indices = boruta_rank.argsort()[:boruta_count]
print(relevant_indices)
Example #9
        max_depth=60,
        n_estimators=100,
        class_weight='balanced',
    )
elif model == 'ln':
    clf = LinearSVC(C=0.01, penalty="l1", dual=False)
elif model == 'et':
    clf = ExtraTreesClassifier()
elif model == 'xgb':
    clf = XGBClassifier(learning_rate=1.0)
else:
    raise Exception("No model: %s" % model)

feat_selector = BorutaPy(clf,
                         n_estimators=1000,
                         verbose=2,
                         random_state=1,
                         alpha=0.00001,
                         max_iter=200)
new_df = feature_selection.run(df=new_df,
                               method=feature_selection_routine,
                               select_x=list(new_df.columns),
                               selector=feat_selector,
                               rank=2)
if sv_test:
    original_df = redshif.read(table_name="merge", routine_name="adm")
intersect_col = list(set(new_df.columns) & set(original_df.columns))
new_df = new_df.drop(intersect_col, axis=1)
new_df = pd.concat([original_df, new_df], axis=1)

# Write selected features
if sv_test:
Example #10
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# load the iris dataset
iris = datasets.load_iris()

x = iris['data']
# x = np.insert(x, 4, 100, axis=1)   # to try out a Rejected feature
# print(x)   # to try out a Rejected feature
y = iris['target']

# split into training and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

rf = RandomForestClassifier(n_estimators=20,n_jobs=-1)
feat_selector = BorutaPy(rf, n_estimators='auto', two_step=False,verbose=2, random_state=42)
feat_selector.fit(X_train,y_train)

# SepalLength	SepalWidth	PetalLength	PetalWidth	Name
# Verbose output shows: the number of iterations, the number of features deemed important, the number still undecided (tentative), and the number judged unimportant
# Iteration: 	8 / 100
# Confirmed: 	4
# Tentative: 	0
# Rejected: 	1
print('feat_selector----')
print(feat_selector)
print('feat_selector.support_----')
print(feat_selector.support_)

print('Feature selection----')
df = pd.DataFrame(iris.data)
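# ------------------------------------------------------------------
# Small follow-up sketch (not in the original snippet): label the columns with the
# iris feature names and list the columns Boruta confirmed.
df.columns = iris['feature_names']
confirmed_cols = df.columns[feat_selector.support_]
print('Confirmed features:', list(confirmed_cols))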
Example #11
import pickle

data = pd.read_csv('framingham.csv')
data.drop(['education'], axis=1, inplace=True)

data.dropna(axis=0, inplace=True)

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

forest = RandomForestClassifier(n_estimators=1000,
                                n_jobs=-1,
                                class_weight='balanced')

# define Boruta feature selection method
feat_selector = BorutaPy(forest, n_estimators='auto', verbose=2)

# find all relevant features
feat_selector.fit(X, y)

top_features = data.columns[:-1][feat_selector.ranking_ <= 6].tolist()

top_features

X_top = data[top_features]
y = data['TenYearCHD']

res = sm.Logit(y, X_top).fit()
res.summary()

X = data[top_features]
def relevant_features(X, array_dict, y, params):
    """
    Determines the subset of features in X that are relevant to the outcome
    using the Boruta algorithm. The result are cross validated. 
        
    Parameters
    ----------
    X : pandas dataframe
        A data set where each row is an observation and each column a feature.
        
    y: numpy array
        A numpy array containing the targets
    
    params: dict,
        A dictionary containing the set of parameters use to initialize BorutaPy
        and determine the number of folds to use to validate the results.
    
    
    Examples
    --------
    # Initialize estimator
    estimator = RandomForestClassifier()
    
    # Define cv and BorutaPy parameters
     params = {'estimator': estimator,
               'cv': 5,
               'n_estimators': 1000,
               'max_iter': 100,
               'verbose': 50,
               'random_state': 42}
     
    # Get relevant feature labels
    labels = relevant_features(X = X, y = y, params = params)
    
    
    Returns
    -----
    labels: list
        A list with the labels identifying the relevant features in X.
    
    
    References
    ----------
    Find more details about Boruta here:
    https://github.com/scikit-learn-contrib/boruta_py
    
    """

    # Unpack params
    if 'cv' in params:
        cv = params['cv']
    else:
        cv = 5

    # Remove cv key from params so we can use with BorutaPy
    del params['cv']

    # Initiate variables
    feature_labels = list(X.columns)
    selected_features_mask = np.ones(len(feature_labels))
    counter = 0

    #Get K-folds indices
    kf = KFold(n_splits=cv)
    kf.get_n_splits(X)

    # Initiate progress bar
    status.printProgressBar(counter,
                            cv,
                            prefix='Progress:',
                            suffix='Complete',
                            length=50)

    # K-fold cross validation
    for train_index, val_index in kf.split(X):
        # Get train fold data
        X_train_fold = X.iloc[train_index, :]
        y_train_fold = y[train_index]

        # Define Boruta feature selection method
        feat_selector = BorutaPy(**params)

        # Find all relevant features
        feat_selector.fit(X_train_fold.values, y_train_fold)

        # Boruta selected feature mask
        selected_features_temp = feat_selector.support_

        # Update selected relevant features
        selected_features_mask = selected_features_mask * selected_features_temp

        # Update progress bar
        counter += 1
        status.printProgressBar(counter,
                                cv,
                                prefix='Progress:',
                                suffix='Complete',
                                length=50)

    # Boruta selected feature labels
    labels = [
        feature_labels[ii] for ii in range(len(feature_labels))
        if selected_features_mask[ii] == 1
    ]

    return labels
Example #13
def classification(file_name):
    import time
    startTime = time.time()
    import pandas as pd
    import numpy as np
    import sklearn
    from sklearn.ensemble import RandomForestClassifier
    import matplotlib.pyplot as plt
    from boruta import BorutaPy
    from sklearn.model_selection import train_test_split
    df = pd.read_csv(file_name)
    df = df.replace([np.inf, -np.inf], np.nan)  # assign back: replace() is not in-place
    df = df.dropna()
    df = df.astype(float)
    y = df['Target'].values
    X=df.drop(['Target'],axis=1)
    col=X.columns.tolist()
    col = ",".join(col)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    # keep X as a DataFrame so column names stay available downstream
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=33, stratify=y)


    #################################### BORUTA ####################################################


    rfc = RandomForestClassifier(n_estimators=200, n_jobs=4, class_weight='balanced', max_depth=6)
    boruta_selector = BorutaPy(rfc, n_estimators='auto')
    boruta_selector.fit(X_train.values, y_train) 
    rank=boruta_selector.ranking_.tolist()

    writefp=open("Ranks.csv",'w')

    s = [str(i) for i in rank] 
    res = (",".join(s))
    writefp.write('Classifiers'+','+ col + '\n')
    writefp.write('Boruta Feature Selection'+','+res + '\n')
    # writefp.write("\n\n\n")
    writefp.close() 

    ############################# RECURSIVE FEATURE ELIMINATION ###########################################


    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(solver='liblinear')
    rfe = RFE(model, n_features_to_select=1)
    fit = rfe.fit(X_train, y_train)
    Rank_rfe = fit.ranking_.tolist() 


    writefp=open("Ranks.csv",'a')

    s = [str(i) for i in Rank_rfe] 
    res = (",".join(s)) 
    writefp.write('Recursive Feature Elimination'+','+res+'\n')
    # writefp.write("\n\n\n")
    writefp.close() 


    ###################################### SELECT K BEST #####################################################


    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2, mutual_info_classif,f_classif

    num_features = len(X_train.columns)

    test = SelectKBest(score_func=f_classif, k=2)
    test.fit(X_train, y_train)
    # keep the scores in column order so they line up with the feature header row
    Ranks = [test.scores_[i] for i in range(num_features)]

    writefp=open("Ranks.csv",'a')

    s = [str(i) for i in Ranks] 
    res = (",".join(s)) 
    writefp.write('Select K Best,'+res+'\n')

    writefp.close() 


    # ################################## RANDOM FOREST CLASSIFIER #######################################


    # Create a random forest classifier
    clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

    # Train the classifier
    clf.fit(X_train, y_train)


    writefp=open("Ranks.csv",'a')

    s = [str(i) for i in clf.feature_importances_] 
    res = (",".join(s)) 
    writefp.write('Random Forest Classifier,'+res+'\n')
    writefp.close() 


    # ############################## EXTRA TREES CLASSIFIER #######################################


    #METHOD 5
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                random_state=0)

    forest.fit(X_train, y_train)
    importances = forest.feature_importances_

    writefp=open("Ranks.csv",'a')

    s = [str(i) for i in importances] 
    res = (",".join(s)) 
    writefp.write('Extra Trees Classifier,'+res+'\n')
    writefp.close()

    # ############################ CORRELATION ########################################################

    corr = []
    for i in X.columns.tolist():
        corr.append(df['Target'].corr(df[i]))


    writefp=open("Ranks.csv",'a')
    s = [str(i) for i in corr] 
    res = (",".join(s))

    writefp.write('Correlation With Target,'+res+'\n')

    writefp.close()

    # ##################################################################################################

    endTime = time.time()
    final_time=endTime - startTime
    def convert(seconds): 
        seconds = seconds % (24 * 3600) 
        hour = seconds // 3600
        seconds %= 3600
        minutes = seconds // 60
        seconds %= 60
        
        return "%d:%02d:%02d" % (hour, minutes, seconds) 
      
    n = final_time
    print(convert(n))
Example #14
def model_infer_iter_ens(data,
                         dm_model,
                         feat_labels,
                         target_name,
                         df_res,
                         y_categorical,
                         data_null,
                         perm=100):
    # iterative inference for ensemble (random forest) methods, with Boruta feature selection
    # restricted to ensemble methods because Boruta requires a feature_importances_ attribute

    x_train, y_train = data['train'].values()
    x_test, y_test = data['test'].values()

    #-------
    # full model
    dm_model.fit(x_train, y_train, x_test, y_test)
    df_res_sp = dm_model.evaluate(data, 'all', 'all', target_name, data_null,
                                  perm)
    df_res = df_res.append(df_res_sp, sort=False)

    # round 1
    sf = selectQuantile(dm_model, threshold=0.75, feat_names=feat_labels.name)
    feat_names_sel = sf.importance_sel.feature
    if len(feat_names_sel) < 1: return df_res, None
    x_tr, x_te = sf.transform_set(x_train, x_test)
    dm_model.fit(x_tr, y_train, x_te, y_test)

    # round 2
    sf = selectQuantile(dm_model, threshold=0.75, feat_names=feat_names_sel)
    feat_names_sel = sf.importance_sel.feature
    if len(feat_names_sel) < 1: return df_res, None
    x_tr, x_te = sf.transform_set(x_tr, x_te)
    dm_model.fit(x_tr, y_train, x_te, y_test)

    # round 3
    sf = selectQuantile(dm_model, threshold=0.75, feat_names=feat_names_sel)
    feat_names_sel = sf.importance_sel.feature
    if len(feat_names_sel) < 1: return df_res, None
    x_tr, x_te = sf.transform_set(x_tr, x_te)

    # boruta feature selection
    dm_model.model.set_params(max_depth=7)
    feat_selector = BorutaPy(dm_model.model, n_estimators='auto', verbose=0)
    feat_selector.fit(x_tr, y_train)

    feat_names_sel = feat_names_sel[feat_selector.support_]
    if len(feat_names_sel) < 1: return df_res, None
    x_tr = feat_selector.transform(x_tr)
    x_te = feat_selector.transform(x_te)
    sf = _featSelect_base()
    sf.importance_sel = pd.DataFrame(feat_names_sel.copy())

    # reduced model
    dm_model.fit(x_tr, y_train, x_te, y_test)

    data['train']['x'] = x_tr
    data['test']['x'] = x_te
    data_null['test']['x'] = x_te
    df_res_sp = dm_model.evaluate(data, 'topfeat', 'topfeat', target_name,
                                  data_null, perm)
    df_res = df_res.append(df_res_sp, sort=False)

    return df_res, sf
Example #15
# RandomForestClassifier is used here as the estimator for Boruta.
# A max_depth between 3 and 7 is advised for better results; most settings are left at their defaults.

rf = RandomForestClassifier(criterion='gini',
                            n_estimators=500,
                            max_features='auto',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1,
                            max_depth=6)

X_boruta = features.values
y_boruta = target.values

boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2)
start_time = timer(None)
boruta_selector.fit(X_boruta, y_boruta)
timer(start_time)

# In[ ]:

# number of selected features after feature selection process

print('\n Number of selected features:')
print(boruta_selector.n_features_)

# In[ ]:

# Put selected list into pandas DataFrame in ascending sort
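# ------------------------------------------------------------------
# One possible continuation (not in the original notebook cell): tabulate each feature
# with its Boruta ranking and support flag, sorted ascending by rank. Assumes `features`
# is the DataFrame whose .values were used as X_boruta above.
import pandas as pd

selected_df = pd.DataFrame({
    'Feature': features.columns,
    'Ranking': boruta_selector.ranking_,
    'Selected': boruta_selector.support_,
}).sort_values('Ranking', ascending=True)
print(selected_df.head(boruta_selector.n_features_))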
Example #16
def model_infer_ens_custom(data,
                           dm_model,
                           feat_labels,
                           target_name,
                           df_res,
                           y_categorical,
                           data_null,
                           perm=100,
                           sf_iterThresholds=[0.75, 0.75, 0.75],
                           sf_topK=None):
    # ensemble (random forest) methods, with Boruta feature selection
    # restricted to ensemble methods because Boruta requires a feature_importances_ attribute
    # with custom feature selection (either iterative or topK features)
    # this technically is generic enough that we don't need model_infer_iter_ens, but that method
    # is kept for backward compatibility

    x_train, y_train = data['train'].values()
    x_test, y_test = data['test'].values()

    # -------
    # full model
    dm_model.fit(x_train, y_train, x_test, y_test)
    df_res_sp = dm_model.evaluate(data, 'all', 'all', target_name, data_null,
                                  perm)
    df_res = df_res.append(df_res_sp, sort=False)

    # set up feat_names_sel values and copy train sets, for use in iterative selection next
    x_tr = x_train.copy()
    x_te = x_test.copy()
    feat_names_sel = feat_labels.name

    for threshold in sf_iterThresholds:
        sf = selectQuantile(dm_model,
                            threshold=threshold,
                            feat_names=feat_names_sel)
        feat_names_sel = sf.importance_sel.feature
        if len(feat_names_sel) < 1: return df_res, None
        x_tr, x_te = sf.transform_set(x_tr, x_te)
        dm_model.fit(x_tr, y_train, x_te, y_test)

    if sf_topK:
        sf = selectKFeat(dm_model, k=sf_topK, feat_names=feat_names_sel)
        feat_names_sel = sf.importance_sel.feature
        if len(feat_names_sel) < 1: return df_res, None
        x_tr, x_te = sf.transform_set(x_tr, x_te)
        dm_model.fit(x_tr, y_train, x_te, y_test)

    # boruta feature selection
    dm_model.model.set_params(max_depth=7)
    feat_selector = BorutaPy(dm_model.model, n_estimators='auto', verbose=0)
    feat_selector.fit(x_tr, y_train)

    feat_names_sel = feat_names_sel[feat_selector.support_]
    if len(feat_names_sel) < 1: return df_res, None
    x_tr = feat_selector.transform(x_tr)
    x_te = feat_selector.transform(x_te)
    sf = _featSelect_base()
    sf.importance_sel = pd.DataFrame(feat_names_sel.copy())

    # reduced model
    dm_model.fit(x_tr, y_train, x_te, y_test)

    data['train']['x'] = x_tr
    data['test']['x'] = x_te
    data_null['test']['x'] = x_te
    df_res_sp = dm_model.evaluate(data, 'topfeat', 'topfeat', target_name,
                                  data_null, perm)
    df_res = df_res.append(df_res_sp, sort=False)

    return df_res, sf
ld_xn = ld.iloc[:,0:10]


# In[51]:


import numpy as np
ld_dup = np.array(ld_xn)
ld_dup


# In[52]:


boruta_feature_selector = BorutaPy(rf, random_state = 111, max_iter =25, perc = 100, verbose = 2)
boruta_feature_selector


# In[53]:


#do feature selection on your entire data

boruta_feature_selector.fit(ld_dup,ld_y)


# In[54]:


boruta_feature_selector.support_
Example #18
    y = groups
    features = tax_tab.columns
    max_depth = 5
    max_iter = 500

    # define random forest classifier, with utilising all cores and
    # sampling in proportion to y labels
    # [tree pruning!!!] highly recommend using pruned trees with a depth between 3-7
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=max_depth,
                                random_state=123)
    # define Boruta feature selection method
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=1,
                             random_state=123,
                             max_iter=max_iter)
    # find all relevant features
    feat_selector.fit(X, y)
    # check selected features
    #print feat_selector.support_
    # check ranking of features
    #print feat_selector.ranking_
    # call transform() on X to filter it down to selected features
    # X_filtered = feat_selector.transform(X)
    print("confirmed features:")
    print(features[feat_selector.support_])
    print("tentative features:")
    print(features[feat_selector.support_weak_])
Example #19
Modelling Points

"""
##################################################################################################  

X = df.drop(columns = ['web_name', 'total_points'])
y = df['total_points']

###initialize Boruta
forest = RandomForestRegressor(
   n_jobs = -1, 
   max_depth = 7
)
boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   max_iter = 200 # number of trials to perform
)
### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(np.array(X), np.array(y))
### print results
green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()
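### small addition (not in the original snippet): actually print the two groups
print('confirmed (green) features:', green_area)
print('tentative (blue) features:', blue_area)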

df_chosen = df[green_area]


### initialise linear regression for points predictions
reg = LinearRegression().fit(df_chosen,y)

predictions = reg.predict(df_chosen)
def relevant_features(X, y, params):
    # Unpack params
    if 'cv' in params:
        cv = params['cv']
    else:
        cv = 5

    # Remove cv key from params so we can use with BorutaPy
    del params['cv']

    # Initiate variables
    feature_labels = list(X.columns)
    selected_features_mask = np.ones(len(feature_labels))
    counter = 0

    # Get K-folds indices
    kf = KFold(n_splits=cv)
    kf.get_n_splits(X)

    # Initiate progress bar
    status.printProgressBar(counter,
                            cv,
                            prefix='Progress:',
                            suffix='Complete',
                            length=50)

    # K-fold cross validation
    for train_index, val_index in kf.split(X):
        # Get train fold data
        X_train_fold = X.iloc[train_index, :]
        y_train_fold = y[train_index]

        # Define Boruta feature selection method
        feat_selector = BorutaPy(**params)

        # Find all relevant features
        feat_selector.fit(X_train_fold.values, y_train_fold)

        # Boruta selected feature mask
        selected_features_temp = feat_selector.support_

        # Update selected relevant features
        selected_features_mask = selected_features_mask + selected_features_temp

        # Update progress bar
        counter += 1
        status.printProgressBar(counter,
                                cv,
                                prefix='Progress:',
                                suffix='Complete',
                                length=50)

    # Boruta selected feature labels
    labels = [
        feature_labels[ii] for ii in range(len(feature_labels))
        if selected_features_mask[ii] >= 3
    ]
    if len(labels) < 4:
        labels = [
            feature_labels[ii] for ii in range(len(feature_labels))
            if selected_features_mask[ii] > 0
        ]
    if len(labels) < 4:
        labels = labels + [feature_labels[ii] for ii in range(4)]
    return labels
Example #21
    X_train = train[X_cols]
    X_test = test[X_cols]
    
    train_scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = train_scaler.transform(X_train)
    X_test_scaled = train_scaler.transform(X_test)
    Y_trainval = Y_train.values
    return X_test, X_test_scaled, X_train, X_train_scaled, Y_test, Y_train, Y_trainval, M_train, M_test, target, sample_weights

def selBoruta(X_test, X_test_scaled, X_train, X_train_scaled, Y_trainval, n_jobs=-1, class_weight='balanced', max_depth=5, n_estimators='auto', verbose=2, random_state=1, rseed=80085):
    # define random forest classifier, with utilising all cores and
    # sampling in proportion to y labels
    rf = RandomForestClassifier(n_jobs=n_jobs, class_weight=class_weight, max_depth=max_depth)
    
    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators=n_estimators, verbose=verbose, random_state=random_state)
    np.random.seed(rseed)
    feat_selector.fit(X_train_scaled, Y_trainval)
    
    criteria = pd.Series(feat_selector.support_)
    X_train_boruta = feat_selector.transform(X_train_scaled)
    X_train_boruta = pd.DataFrame(X_train_boruta, index = X_train.index, columns = X_train.columns[criteria].values)
    
    criteria = pd.Series(feat_selector.support_)
    X_test_boruta = feat_selector.transform(X_test_scaled)
    X_test_boruta = pd.DataFrame(X_test_boruta, index = X_test.index, columns = X_test.columns[criteria].values)

    return X_test_boruta, X_train_boruta


def trainSVC(X_train_boruta, Y_train, param_set, score, sample_weights):
Example #22
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from ML_UtilsModule import Data_Management, Normalization
from boruta import BorutaPy

# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X, y = Data_Management.load_csv_types_features("pokemon.csv", [
    "hp", "attack", "defense", "sp_attack", "sp_defense", "speed", "height_m",
    "weight_kg", "percentage_male", "generation"
])
y = y.ravel()

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)
Example #23
# Saving feature names for later use
feature_names = X.columns

# Transforming X into an array
X = X.values

if BORUTA:
    rf = RandomForestRegressor(n_estimators=200,
                               random_state=1234,
                               n_jobs=-1,
                               max_depth=10)

    # Boruta
    feat_selector = BorutaPy(rf,
                             n_estimators="auto",
                             verbose=2,
                             random_state=1,
                             perc=70)
    feat_selector.fit(X, y)

    # Selected vars
    boruta_vars = feature_names[feat_selector.support_].to_list()

    # Removed vars
    removed_vars = [var for var in feature_names if var not in boruta_vars]

    # Saving Boruta vars
    filename = "boruta_perc_70"
    outfile = open(filename, 'wb')
    pickle.dump(boruta_vars, outfile)
    outfile.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Date    : 2021-01-24
# @Contact    : [email protected]
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import ExtraTreesClassifier
from joblib import dump
from joblib import load
import os

boruta = BorutaPy(ExtraTreesClassifier(max_depth=5, n_jobs=4),
                  n_estimators='auto',
                  max_iter=1000,
                  random_state=0,
                  verbose=2)

train = load('data/train2.pkl')
train.fillna(0, inplace=True)
train[np.isinf(train)] = 0
y = train.pop('label')
boruta.fit(train, y)
dump(boruta, 'data/boruta3.pkl')
os.system(
    'google-chrome https://ssl.gstatic.com/dictionary/static/sounds/oxford/ok--_gb_1.mp3'
)
print(boruta)
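# ------------------------------------------------------------------
# Hedged follow-up (not part of the original script): reload the fitted selector and
# keep only the confirmed columns of the training frame.
boruta_loaded = load('data/boruta3.pkl')
train_confirmed = train.loc[:, boruta_loaded.support_]
print(train_confirmed.shape)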
### Feature selection using Boruta 


# In[74]:


get_ipython().system('pip install boruta')
from boruta import BorutaPy


# In[75]:


rf = RandomForestClassifier(random_state=1, n_estimators=100, max_depth=5)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)   # initialize the boruta selector
boruta_selector.fit(np.array(sX_train), np.array(y_train))  


# In[76]:


print("Selected Features: ", boruta_selector.support_)    # check selected features
print("Ranking: ",boruta_selector.ranking_)               # check ranking of features
print("No. of significant features: ", boruta_selector.n_features_)


# In[77]:


selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
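# Hypothetical completion (the cell above is cut off mid-statement); a common pattern
# is to pair the column names with the Boruta ranking, e.g.:
# selected_rf_features = pd.DataFrame({'Feature': list(X_train.columns),
#                                      'Ranking': boruta_selector.ranking_}).sort_values(by='Ranking')
# print(selected_rf_features)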
Example #26
def trainKFolds(X, Y, eval, folds=5):
    if eval == "reg":
        eval_metric = "rmse"
        eval_metric2 = "mae"
    else:
        eval_metric = "logloss"
        eval_metric2 = "auc"
    np.random.seed(
        None
    )  #removing any seed to ensure that the folds are created differently

    #initialize empty lists - not the most efficient, but it works
    bar_times = []
    boruta_times = []
    y_hat = []
    y_hat2 = []
    y_hat_BR = []
    y_hat_BR2 = []
    y_hat_boruta = []
    y_hat_boruta2 = []
    y_actual = []
    fold = []

    #Start the cross validation
    kf = KFold(n_splits=folds)
    i = 1
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], Y[
            train], Y[test]

        #Get predictions on all features
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)
        y_pred2 = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric2)

        #BoostARoota - tune to metric 1
        tmp = time.time()
        BR_vars = BoostARoota2(X_train, y_train, metric=eval_metric)
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        #BoostARoota - tune to metric 2
        tmp = time.time()
        BR_vars = BoostARoota2(X_train, y_train, metric=eval_metric2)
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds2 = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric2)

        # #Boruta - get predictions
        tmp = time.time()
        rf = RandomForestClassifier(n_jobs=-1,
                                    class_weight='balanced',  # 'auto' was removed from sklearn
                                    max_depth=5)
        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=2,
                                 random_state=1)
        feat_selector.fit(X_train.values, y_train.values)
        boruta_times.append(time.time() - tmp)
        X_train_filter = feat_selector.transform(X_train.values)
        X_test_filter = feat_selector.transform(X_test.values)
        Boruta_preds = TrainGetPreds(X_train_filter,
                                     y_train,
                                     X_test_filter,
                                     metric=eval_metric)
        Boruta_preds2 = TrainGetPreds(X_train_filter,
                                      y_train,
                                      X_test_filter,
                                      metric=eval_metric2)

        # evaluate predictions and append to lists
        y_hat.extend(y_pred)
        y_hat2.extend(y_pred2)
        y_hat_BR.extend(BR_preds)
        y_hat_BR2.extend(BR_preds2)
        y_hat_boruta.extend(Boruta_preds)
        y_hat_boruta2.extend(Boruta_preds2)
        y_actual.extend(y_test)
        #Set the fold it is trained on
        fold.extend([i] * len(y_pred))
        i += 1

    #Start building the array to be passed out; first is the timings, then the eval results
    values = [np.mean(boruta_times), np.mean(bar_times)]
    #Build the dataframe to pass into the evaluation functions
    results = pd.DataFrame({
        "y_hat": y_hat,
        "y_hat2": y_hat2,
        "Fold": fold,
        "y_hat_BR": y_hat_BR,
        "y_hat_BR2": y_hat_BR2,
        "y_hat_boruta": y_hat_boruta,
        "y_hat_boruta2": y_hat_boruta2,
        "y_actual": y_actual
    })

    #then append the evaluation results to values
    values.extend(evalResults(results, eval=eval))

    return values
Example #27
def feature_selection(output_dir,
                      mm,
                      ds,
                      y,
                      X,
                      use_fs_algos=["all"],
                      curr_round=False):
    if not use_fs_algos:
        use_fs_algos = ["all"]

    #setup test/train arrays
    test_X_arr, train_X_arr = ds.get_test_train_X_arrays(X)
    test_y_arr, train_y_arr = ds.get_test_train_var_arrays(y)
    categorical_bool = ds.get_categorical_bool(X)
    random_variables = ds.get_random_variables()

    print("Shape of Training Features: " + str(train_X_arr.shape))

    #FULL FEATURE SELECTION ALGOS
    algo_dfs = []
    #Mutual Information
    if "mutual_info" in use_fs_algos or "all" in use_fs_algos:
        print("Mutual Info Algo")
        t1 = time()
        mi = mutual_info_regression(train_X_arr,
                                    train_y_arr,
                                    discrete_features=categorical_bool)
        mi /= np.max(mi)
        mi_df = var_importance_table(mi, X, 'mutual_info')
        t2 = time()
        t = (t2 - t1) / 60
        print("Mutual Info completed in " + '{0:.2f}'.format(t) + "m.\n")
        algo_dfs.append(mi_df)

    #F_regression
    if "f_regress" in use_fs_algos or "all" in use_fs_algos:
        print("F regression")
        t1 = time()
        f_test, _ = f_regression(train_X_arr, train_y_arr)
        f_test /= np.max(f_test)
        f_df = var_importance_table(f_test, X, 'f_regression')
        t2 = time()
        t = (t2 - t1) / 60
        print("F regression completed in " + '{0:.2f}'.format(t) + "m.\n")
        algo_dfs.append(f_df)

    #Normal RF
    if "fs_rf" in use_fs_algos or "all" in use_fs_algos:
        model_name = fs_step + "_rf"
        rf_model = mm.create_rf_model(ds, train_X_arr, train_y_arr, y, X,
                                      model_name)
        imp_df = rf_model.get_importance_df()
        algo_dfs.append(imp_df)

    #Normal GBR
    if "fs_gbr" in use_fs_algos or "all" in use_fs_algos:
        model_name = fs_step + "_gbr"
        gbr_model = mm.create_gbr_model(ds, train_X_arr, train_y_arr, y, X,
                                        model_name)
        imp_df = gbr_model.get_importance_df()
        algo_dfs.append(imp_df)

    #Microsoft LightGBM
    if "fs_lgb" in use_fs_algos or "all" in use_fs_algos:
        model_name = fs_step + "_lgbm"
        eval_set = [(test_X_arr, test_y_arr)]
        lgbm_model = mm.create_lgbm_model(ds, train_X_arr, train_y_arr, y, X,
                                          model_name, eval_set)
        imp_df = lgbm_model.get_importance_df()
        algo_dfs.append(imp_df)

    rank_dfs = []
    #Boruta RF
    if "boruta_rf" in use_fs_algos or "all" in use_fs_algos:
        print("Boruta RF")
        t1 = time()
        rf = RandomForestRegressor(n_jobs=-1)
        fs_selector = BorutaPy(rf,
                               n_estimators='auto',
                               random_state=3142,
                               max_iter=70)
        fs_selector.fit(train_X_arr, train_y_arr)
        scores = fs_selector.ranking_
        rf_df = var_importance_table(scores, X, 'boruta_rf')
        t2 = time()
        t = (t2 - t1) / 60
        print("Boruta RF completed in " + '{0:.2f}'.format(t) + "m.\n")
        rank_dfs.append(rf_df)

    #Boruta GBR
    if "boruta_gbr" in use_fs_algos or "all" in use_fs_algos:
        print("Boruta GBR")
        t1 = time()
        gbr = GradientBoostingRegressor()
        fs_selector = BorutaPy(gbr,
                               n_estimators='auto',
                               random_state=3142,
                               max_iter=70)
        fs_selector.fit(train_X_arr, train_y_arr)
        scores = fs_selector.ranking_
        gbr_df = var_importance_table(scores, X, 'boruta_gbr')
        t2 = time()
        t = (t2 - t1) / 60
        print("Boruta GBR completed in " + '{0:.2f}'.format(t) + "m.\n")
        rank_dfs.append(gbr_df)

    #Recursive Feature Elimination with SVR
    if "recursive_svr" in use_fs_algos or "all" in use_fs_algos:
        print("Recursive SVR")
        t1 = time()
        svr = SVR(kernel="linear")
        rfecv = RFECV(estimator=svr, step=1, cv=KFold(3),
                      scoring='neg_mean_squared_error')  # SVR is a regressor, so use a regression metric
        rfecv.fit(train_X_arr, train_y_arr)

        #print("Optimal number of features : %d" % rfecv.n_features_)
        ## Plot number of features VS. cross-validation scores
        #plt.figure()
        #plt.xlabel("Number of features selected")
        #plt.ylabel("Cross validation score (nb of correct classifications)")
        #plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        #plt.show()

        scores = rfecv.ranking_
        rsvc_df = var_importance_table(scores, X, 'recursive_svc')
        t2 = time()
        t = (t2 - t1) / 60
        print("Recursive SVR completed in " + '{0:.2f}'.format(t) + "m.\n")
        rank_dfs.append(rsvc_df)

    # set model metadata to include round # and
    mm.set_metadata_feature_live_models("round", curr_round)

    temp_df = pd.DataFrame(X, columns=['var'])
    if not algo_dfs:
        algo_dfs = [temp_df]
    if not rank_dfs:
        rank_dfs = [temp_df]

    if algo_dfs and rank_dfs:
        #Join all dfs
        #convert below code into a function (for memory reasons)
        #Get all recorded importances
        df_final_imp = reduce(
            lambda left, right: pd.merge(left, right, how='outer', on='var'),
            algo_dfs)
        df_final_imp['sum'] = df_final_imp.sum(axis=1)
        new_rank_dfs = rank_dfs + [df_final_imp]
        df_final = reduce(
            lambda left, right: pd.merge(left, right, how='outer', on='var'),
            new_rank_dfs)
        #df_final.to_csv("./temp1.csv")
        df_final.sort_values('sum', ascending=False, inplace=True)
        sum_ordered_vars = df_final['var'].values.tolist(
        )  #keep this list to reorder garbage variables
        #Output dfs to fsdb.csv
        df_final.to_csv(output_dir + '/feature_importances.csv', index=False)

        #Get ordered lists of features
        df_final_vars = df_final.copy(deep=True)
        final_rank_df = reduce(
            lambda left, right: pd.merge(left, right, how='outer', on='var'),
            rank_dfs)
        rank_cols = list(set(final_rank_df.columns.tolist()) - set(['var']))
        imp_cols = list(
            set(df_final_vars.columns.tolist()) - set(['var']) -
            set(rank_cols))
        for col in rank_cols:
            temp_df = df_final[['var', col]]
            temp_series = temp_df.sort_values(col)['var'].values.tolist()
            df_final_vars.loc[:, col] = temp_series
        for col in imp_cols:
            temp_df = df_final[['var', col]]
            temp_series = temp_df.sort_values(
                col, ascending=False)['var'].values.tolist()
            df_final_vars.loc[:, col] = temp_series
        df_final_vars = df_final_vars[rank_cols + imp_cols]
        df_final_vars.to_csv(output_dir + '/ordered_features.csv', index=False)

        #Get ordered list of selected features by random importance screening
        sel_df = pd.DataFrame()
        for col in rank_cols:
            temp_rank_df = df_final[['var', col]]
            temp_rank_df = temp_rank_df.query(col + " == 1 | " + col + " == 2")
            temp_rank_df = temp_rank_df.sort_values(col, ascending=False)
            ordered_ftrs = temp_rank_df['var'].values.tolist()
            temp_results = pd.DataFrame(ordered_ftrs, columns=[col])
            sel_df = pd.concat([sel_df, temp_results], axis=1)
        for col in imp_cols:
            temp_ftr_list = df_final_vars[col].values.tolist()
            ordered_ftrs = grab_top_features(temp_ftr_list, random_variables)
            temp_results = pd.DataFrame(ordered_ftrs, columns=[col])
            sel_df = pd.concat([sel_df, temp_results], axis=1)
        sel_df.to_csv(output_dir + '/selected_features.csv', index=False)

        #grab all important features from selected features df
        fi_cols = [x + "_importance" for x in use_fs_algos]

        if fi_cols:
            sel_cols = sel_df.columns.tolist()
            fi_cols = ordered_subset(sel_cols, fi_cols)
            sel_df = sel_df[fi_cols]
        important_features = grab_all_features(sel_df)

        #adding back random variables
        important_features = list(set(important_features + random_variables))

        if max_drop_dict:
            num_ftrs = len(sum_ordered_vars)
            max_drop = get_max_drop(num_ftrs, max_drop_dict)
            if max_drop:
                #ensure to count only non random variables in garbage
                garbaged = set(sum_ordered_vars) - set(important_features)
                garbaged = ordered_subset(sum_ordered_vars, garbaged)
                curr_drop = len(garbaged)
                if curr_drop > max_drop:
                    print(
                        "Restricting the # of variables dropped from {0} to {1}"
                        .format(curr_drop, max_drop))
                    new_garbage = garbaged[-max_drop:]
                    important_features = list(
                        set(sum_ordered_vars) -
                        set(new_garbage))  #this will add back randoms
                    important_features = ordered_subset(
                        sum_ordered_vars, important_features)

        #generate importance graphs
        imp_cols_plus = imp_cols + ['var']
        imp_df = df_final[imp_cols_plus]
        annot = "# of Features: " + str(len(X))
        if curr_round:
            annot = annot + "\nRound #: " + str(curr_round)
        imp_figs = feature_importance_bargraphs(imp_df, "", annot)

        return important_features, imp_figs
    def fit(self, df, cfg):
        """
        Performs Boruta feature selection
        
        Parameters:
            df (dataframe): dataframe.
            cfg (dict): configuration dictionary.
            
        Returns:
            selected_features: list of selected variable names.
        """

        all_features = [
            x for x in df.columns
            if x not in cfg['drop_cols'] + [cfg['ID_COL'], cfg['CTU_COL']]
        ]

        X = df[all_features].values
        y = df[cfg['TE_TARGET_COL']].values.ravel()

        if (sum(y) / len(y)) < 0.1:
            class_ratio = (len(y) - sum(y)) / sum(y)
            print("Class Ratio:", class_ratio)
            class_weight = dict({1: class_ratio, 0: 1.5})
            max_depth = 8
            n_estimators = 400
        else:
            class_weight = None
            max_depth = 5
            n_estimators = 200

        param = {
            'bootstrap': True,
            'class_weight': class_weight,
            'criterion': 'gini',
            'max_depth': max_depth,
            'max_features': 'auto',
            'max_leaf_nodes': None,
            'min_impurity_decrease': 0.0,
            'min_impurity_split': None,
            'min_samples_leaf': 2,
            'min_samples_split': 10,
            'min_weight_fraction_leaf': 0.0,
            'n_estimators': n_estimators,
            'oob_score': False,
            'random_state': 121,
            'verbose': 0,
            'warm_start': False
        }

        rf = RandomForestClassifier(**param)

        feat_selector = BorutaPy(rf,
                                 n_estimators='auto',
                                 verbose=2,
                                 random_state=cfg['seed'],
                                 max_iter=cfg['max_iter'],
                                 perc=cfg['z_score_percentile'],
                                 two_step=cfg['two_step'])

        feat_selector.fit(X, y)

        selected_features = [
            col for (col, id_bool) in zip(all_features, feat_selector.support_)
            if id_bool
        ]

        return selected_features
Example #29
Feature Selection
You will probably need to install the boruta package: pip install boruta
We perform a feature selection using Random Forest as the estimator.
Configure Random Forest
'''
print("------ Feature selection...")
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
'''
Configure BorutaPy for feature selection based on the configuration made
for Random Forest
'''
feat_selector = BorutaPy(rf,
                         max_iter=9,
                         n_estimators=200,
                         verbose=0,
                         random_state=123456)

#Apply it to our data.
feat_selector.fit(X, y)

#Check the selected features (variables).
all_features = numpy.array(list(bank)[0:-1])
selected_features = all_features[feat_selector.support_]

print("\nCaracterísticas seleccionadas:")
print(selected_features)

#Check the feature ranking.
print("\nFeature ranking:")