Example no. 1
def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    if show > 0:
        print('Feature Selection - ' + bcl[0] +
              ': reducing the number of features from ' + str(X.shape[1]) +
              ' to ' + str(m) + ' ...')
    if bcl[0] == 'Fisher':
        sel = sfsfisher(X, d, m)
    else:
        estimator = defineModel(bcl)
        sfs = SFS(estimator,
                  k_features=m,
                  forward=forward,
                  floating=floating,
                  verbose=show,
                  scoring='accuracy',
                  cv=cv)
        sfs = sfs.fit(X, d)
        sel = list(sfs.k_feature_idx_)
        if show > 0:
            print(' ')
        if show:
            plot_sfs(sfs.get_metric_dict(), kind='std_err')
            plt.title('Sequential Forward Selection')
            plt.grid()
            plt.show()
    return sel
    def score_of_SFS(self, model=None):
        # Forward selection: start from an empty feature set and add the best feature one at a time.
        # Display: how the score changes as the number of features increases.
        # Up to K best features can be selected.
        selector = SFS(model,
                       k_features=self.K,
                       forward=True,
                       floating=False,
                       # scoring='neg_mean_squared_error',
                       scoring=self.score,
                       cv=0)
        selector.fit(self.train_X, self.train_y)

        features_idx = []
        for k, v in selector.get_metric_dict().items():
            for f in v['feature_idx']:
                if f not in features_idx:
                    # collect features in the order they were selected (most important first)
                    features_idx.append(f)

        sort_num = []
        for f in self.continuous_feature_names:
            i = features_idx.index(f)+1
            sort_num.append(i)

        sc = [1/x for x in sort_num]
        sum_sc = sum(sc)
        featureScore = [round(s/sum_sc, 4) for s in sc]

        model_name = str(model).split('(')[0]
        print(model_name + ' by SFS is finished')
        return featureScore
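# A worked example of the reciprocal-rank scoring in score_of_SFS above, with
# illustrative numbers (not from the original data):
#   sort_num = [1, 2, 3]                      # selection ranks of three features
#   sc = [1, 1/2, 1/3]                        # reciprocal ranks
#   featureScore ≈ [0.5455, 0.2727, 0.1818]   # each sc value divided by sum(sc) = 11/6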
def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath,
                     test_filepath):
    logger.info('FEATURE SELECTION...')

    if bool(config.params.clean_experiment_directory_before_training
            ) and os.path.isdir(config.params.experiment_dir):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)

    train_set = data['train']

    y = train_set[config.TARGET_COL].values.reshape(-1, )
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG,
                                        suffix=tag)

    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    fig = plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
class DFSequentialFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = SequentialFeatureSelector(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        self.stat_df = pd.DataFrame.from_dict(
            self.selector.get_metric_dict()).T
        self.stat_df.at[self.stat_df['avg_score'].astype(float).idxmax(),
                        'support'] = True
        self.stat_df['support'].fillna(False, inplace=True)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        features = list(
            self.stat_df[self.stat_df['support']]['feature_names'].values[0])
        new_X = X[features].copy()

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
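# A minimal usage sketch of DFSequentialFeatureSelector (the toy data and estimator
# below are illustrative assumptions, not from the original code): the wrapper passes
# its keyword arguments to mlxtend's SequentialFeatureSelector and keeps the pandas
# column names of the selected features.
import pandas as pd
from sklearn.linear_model import LogisticRegression

toy_X = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [1, 0, 1, 0], 'c': [5, 4, 3, 2]})
toy_y = pd.Series([0, 1, 0, 1])
df_selector = DFSequentialFeatureSelector(estimator=LogisticRegression(),
                                          k_features=2,
                                          forward=True,
                                          scoring='accuracy',
                                          cv=2)
reduced_X = df_selector.fit_transform(toy_X, toy_y)  # DataFrame with the chosen columns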
Example no. 5
def get_best_logisitc(y):
    
  from mlxtend.feature_selection import SequentialFeatureSelector as SFS
  from sklearn.cross_validation import StratifiedKFold
  import pandas as pd
  from sklearn.linear_model import LogisticRegression
  from sklearn.cross_validation import cross_val_score
  
  my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')
  
  y = my_data.target
  my_data = my_data.drop('target', axis=1)
  
    
  # To have better CV
  skf = StratifiedKFold(y, n_folds=5, random_state=17, shuffle=False)

  C_params = [0.01 , 1, 10, 50, 70, 100]
  solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']

  my_result_list = []
  for C_param in C_params:
      for solver in solvers:
          print "Looking for C : %s and solver : %s" % (C_param, solver)
          model = LogisticRegression(class_weight='balanced', random_state=17, 
                                     solver=solver, C=C_param)
          sfs = SFS(model, 
                    k_features=len(my_data.columns), 
                    forward=True, 
                    floating=False, 
                    scoring='roc_auc',
                    print_progress=False,
                    cv=skf,
                    n_jobs=-1)
          
          sfs = sfs.fit(my_data.values, y.values)

          result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
          result_sfs.sort_values('avg_score', ascending=0, inplace=True)
          features_sfs = result_sfs.feature_idx.head(1).tolist()
          select_features_sfs = list(my_data.columns[features_sfs])

          scores = cross_val_score(model, my_data[select_features_sfs], y, cv=skf, scoring='roc_auc')
          my_result_list.append({'C' : C_param,
                               'solver' : solver,
                               'auc' : scores.mean(),
                               'std' : scores.std(),
                               'best_columns' : select_features_sfs,
                               'estimator' : model})

  my_result = pd.DataFrame(my_result_list)
  my_result.sort_values('auc', ascending=0, inplace=True)

  best_features = my_result.best_columns.head(1).values[0]
  best_model = my_result.estimator.head(1).values[0]

  return best_features, best_model
    def figs_of_SFS(self, model=None):
        selector = SFS(model,
                       k_features=self.K,
                       forward=True,
                       floating=False,
                       # scoring='neg_mean_squared_error',
                       cv=0)
        selector.fit(self.train_X, self.train_y)
        model_name = str(model).split('(')[0]
        fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
        plt.title('SFS of {}'.format(model_name))
        plt.grid()
        plt.show()
Example no. 7
def test_randomholdoutsplit_in_sfs():
    h_iter = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
Example no. 8
def plot_feed_forward_models():
    """
    Plots the performance for each iteration of the feed-forward model.
    The numbers of features chosen are 15 and 20, since these showed the best results.

    """
    # create Linear Regression model
    regr = LinearRegression()

    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)

    sfs_model = sfs_model.fit(X_train, y_train)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()

    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
Example no. 9
def test_predefinedholdoutsplit_in_sfs():
    h_iter = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()

    return sfs
def perform_sfs(curr_classifier, X_train, X_test, y_train, y_test):
    sfs1 = SFS(curr_classifier,
               k_features=100,
               verbose=0,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=5,
               n_jobs=8)

    sfs1 = sfs1.fit(X_train, y_train)
    df = pd.DataFrame.from_dict(sfs1.get_metric_dict(), orient='index')
    df[[
        'accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix'
    ]] = df['feature_idx'].apply(lambda x: get_test_score(
        X_train, X_test, y_train, y_test, x, curr_classifier)).apply(pd.Series)

    return df
Example no. 12
def fse_sfs(bcl, X, d, m, cv=0, show=0):
    estimator = defineModel(bcl)
    sfs = SFS(estimator,
              k_features=m,
              forward=True,
              floating=False,
              verbose=2,
              scoring='accuracy',
              cv=cv)
    sfs = sfs.fit(X, d)
    sel = sfs.k_feature_idx_
    print(' ')
    if show:
        plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sel
def feature_selection(regr, train):
    x, y = train.drop(columns=['Id', 'SalePrice']), train['SalePrice']

    regr.fit(x, y)

    sfs = SFS(regr, k_features=x.shape[1] - 10, forward=False, verbose=2,
              scoring='neg_mean_squared_error', cv=4)
    sfs.fit(x, y)
    selected_features = (pd.DataFrame(sfs.get_metric_dict())
                         .T
                         .loc[:, ['feature_names', 'avg_score', 'std_dev', 'std_err']]
                         .sort_values(['avg_score', 'std_dev'], ascending=False)
                         .reset_index(drop=True))

    best_features = selected_features.at[0, 'feature_names']
    best_features = list(best_features)
    bad_features = [f for f in x if f not in best_features]

    return bad_features
def FSRCV(X_train, y_train, forward=True, cv=10):
    """
    Sequential Feature Selector from mlxtend package, used to
    implement a brute-force forward/backward selection.

    See https://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/;
    the documentation also shows how to tune the selector with scikit-learn's
    GridSearchCV (a minimal sketch follows after this function).

    Add/remove the feature that reports the best/worst MSE score
    in the model and run cross validation (default 10-fold).

    Args:
        X_train: numpy
        y_train: numpy
        forward: FSR, iteratively adding features
        cv: default=10
    Return:
        sfs1: model
        cv_scores: tuple (mean, std error)
        X_train_sfs: the new subsets based on the selected features
    """
    # FSR
    estimator = LinearRegression()
    sfs1 = SFS(estimator,
               k_features=X_train.shape[1],
               forward=forward,
               floating=False,
               verbose=2,
               scoring='neg_mean_absolute_error',
               cv=cv)
    sfs1 = sfs1.fit(X_train, y_train)
    X_train_sfs = sfs1.transform(X_train)  # selected best

    # get cv scores
    fsr_results = sfs1.get_metric_dict()
    fsr = pd.DataFrame.from_dict(fsr_results).T

    score = fsr.avg_score
    std = fsr.std_err
    cv_scores = [x for x in zip(score, std)]

    return sfs1, cv_scores, X_train_sfs
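# A minimal sketch (not part of FSRCV) of the GridSearchCV pattern mentioned in the
# docstring above: scikit-learn's GridSearchCV tuning the SFS step inside a Pipeline.
# The estimator, parameter grid and scoring below are illustrative assumptions.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

sfs_step = SFS(LinearRegression(),
               k_features=3,
               forward=True,
               floating=False,
               scoring='neg_mean_absolute_error',
               cv=5)
pipe = Pipeline([('sfs', sfs_step), ('reg', LinearRegression())])
param_grid = {'sfs__k_features': [2, 3, 4]}
gs = GridSearchCV(pipe, param_grid=param_grid,
                  scoring='neg_mean_absolute_error', cv=5)
# gs.fit(X_train, y_train)  # X_train / y_train as passed to FSRCV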
    def select_by_SFS(self, model=None):
        # Forward selection: start from an empty feature set and add the best feature one at a time.
        # Display: how the score changes as the number of features increases.
        # Up to K best features can be selected.
        selector = SFS(model,
                       k_features=self.K,
                       forward=True,
                       floating=False,
                       # scoring='neg_mean_squared_error',
                       cv=0)
        selector.fit(self.train_X, self.train_y)
        k_feature = selector.k_feature_names_
        print('selected features:', k_feature)
        print('selected index:', selector.k_feature_idx_)

        if self.showFig:
            model_name = str(model).split('(')[0]
            plot_sfs(selector.get_metric_dict(), kind='std_dev')
            plt.title('SFS of {}'.format(model_name))
            plt.grid()
            plt.show()
Example no. 16
x_scaled_np = StandardScaler().fit_transform(x_data)
x_scaled_np = PolynomialFeatures(degree=2).fit_transform(x_scaled_np)

print(y)
print(x_scaled_np)

cv = RepeatedKFold(n_splits=5, n_repeats=20)

bins = np.linspace(y.min(), y.max(), 5)
labels = ["1", "2", "3", "4"]
Y_groups = pd.cut(y, bins)

sfs = SFS(regr, floating=True, verbose=2,
          k_features=2, forward=False,
          n_jobs=2,
          scoring='neg_mean_absolute_error', cv=cv)

sfs.fit(x_scaled_np, y)

print("Optimal number of features : %d" % sfs.k_features)
print('Best features :', sfs.k_feature_names_)
print('Best score :', sfs.k_score_)
print(sfs.get_params())
print(sfs)

fig1 = plot_sfs(sfs.get_metric_dict(),
                kind='std_dev',
                figsize=(6, 4))
plt.show()
Example no. 17

cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
sfs_ridge_forward = SFS(Ridge(alpha=0.1),
                        k_features=4,
                        forward=True,
                        floating=True,
                        scoring='neg_mean_squared_error',
                        verbose=2,
                        cv=cv)
sfs_ridge_forward.fit(X_norm, y)
sfs_ridge_forward.k_feature_names_



fig1 = plot_sfs(sfs_ridge_forward.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.ylabel('Performance')
plt.grid()
plt.savefig("forward_processing_Porperty_ridge_"+name+".png", dpi=300)
plt.show()


X_selected_columns = list(sfs_ridge_forward.k_feature_names_)
X_selected = X_norm[X_selected_columns]
ridge=Ridge()
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=cv)
result_ridge= ridge_regressor.fit(X_selected,y)
Example no. 18
nval = test[data_type=='validation'].shape[0]
split = trainN - nval
cv = args.cv or [(range(split),range(split,trainN))]

print 'original train_err:\n', pd.concat((train.apply(lambda x:log_loss(train_y,x)),pd.Series(range(train.shape[1]),name='#',index=train.columns)),1)
print 'simple av validation error:', log_loss(train_y.iloc[split:],train.iloc[split:].mean(1))
Cs=10.**np.array(np.linspace(-4,5,50))
k_features=(1,min(train.shape[1],args.max))
  
lr=LogisticRegressionCV(Cs=Cs,fit_intercept=True)
# lr.fit(train,train_y)
if args.fs=='sfs': fs=SFS(lr,k_features=k_features,forward=True,floating=True,scoring='neg_log_loss',cv=cv,verbose=2)
else:              fs=EFS(lr,min_features=1,max_features=min(train.shape[1],8),scoring='neg_log_loss',cv=cv)
fs.fit(train.values,train_y.values)
print
print pd.DataFrame.from_dict(fs.get_metric_dict()).T
if args.fs=='sfs':
  print 'SFS best score:', fs.k_score_
  print len(fs.k_feature_idx_),'features:',fs.k_feature_idx_
else:
  print 'EFS best score:', fs.best_score_
  print len(fs.best_idx_),'features:',fs.best_idx_
  
lr.fit(fs.transform(train.iloc[:split].values),train_y.iloc[:split])
print
print 'Regularization C:', lr.C_
print 'validation error fitting on train:', log_loss(train_y.iloc[split:],lr.predict_proba(fs.transform(train.iloc[split:].values))[:,1])

lr.fit(fs.transform(train.iloc[split:].values),train_y.iloc[split:])
print 'validation error fitting on val:', log_loss(train_y.iloc[split:],lr.predict_proba(fs.transform(train.iloc[split:].values))[:,1])
if args.out:
Example no. 19
         forward=False,
         floating=False,
        scoring='r2',
         cv=0)
sbs.fit(x_train, y_train)
sbs.k_feature_names_

sfs1 = SFS(LinearRegression(),
         k_features=(1,7),
         forward=True,
         floating=False,
         cv=0)
sfs1.fit(x_train, y_train)
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

dtree = DecisionTreeRegressor()
model = dtree.fit(x_train, y_train)  #train parameters: features and target
dtree_pred = dtree.predict(x_test)

#Visualize the minimal error classification tree
#export graphviz doesn't work in Jupyter Notebook - COMMENT OUT IF USING JUPYTER NOTEBOOK 
from sklearn.externals.six import StringIO  # used to visualize the classification tree
from IPython.display import Image  # used to visualize the classification tree
import pydotplus  # used to visualize the classification tree
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,   
Example no. 20

# **Best subset of Features selected after feature selection process.**

# In[199]:


f_selector.k_feature_names_


# **Plot of Number of Features v/s Performance of Regressor.**

# In[200]:


plot_sfs(f_selector.get_metric_dict(),kind='std_dev')


# Selecting the best subset of features and removing others from X_Train.

# In[201]:


feat_random_forest=list(f_selector.k_feature_names_)
X_train_rf=X_train_rf.loc[:,list(f_selector.k_feature_names_)]


# Using GridSearchCV for Hyperparameter Tuning.

# In[206]:
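# The body of this cell is missing from the excerpt; a minimal sketch of what a
# GridSearchCV tuning step could look like here (the estimator, parameter grid,
# scoring and the y_train name below are illustrative assumptions).
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 5, 10]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=0),
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5)
# grid_search.fit(X_train_rf, y_train)
# grid_search.best_params_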
Example no. 21
#the auc of random forest is: 0.8112633992853715

f_number = 50
sfs4 = SFS(clfRandomForest,
           k_features=f_number,
           forward=True,
           floating=False,
           scoring='roc_auc',
           cv=5)

result4 = sfs4.fit(X_train, y_train, custom_feature_names=feature_names)
#print(X)
result4.subsets_
result4.k_score_

selection_res = pd.DataFrame.from_dict(sfs4.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv(
    "/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withistoryrf.csv",
    sep='\t')

selected_feature_idx = result4.k_feature_idx_
#print(type(selected_feature_idx))
selected_feature = list(selected_feature_idx)
feature_name = []
for i in selected_feature:
    feature_name.append(feature_names[i])
print(feature_name)

fig = plot_sfs(sfs4.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
Example no. 22
y_pred = model.predict(X_test_scoring)
predictions = [round(value) for value in y_pred]
IG_Test_accuracy = accuracy_score(y_test_scoring, predictions)
print('Info Gain Accuracy (Test, Hold-Out): %.2f%%' % (IG_Test_accuracy * 100.0))

# WRAPPER-BASED FORWARD SEQUENTIAL SEARCH
#The forward sequential search will use the Gradient Boosting classifier and look at all the features added sequentially. Then it re-evaluates using the fewest features that give the best accuracy.

# It doesn't appear to add any value past ~7 features, so change k_features to 7 if this runs slowly
sfs_forward = SFS(model,k_features=44,forward=True, verbose=1, scoring='accuracy',cv=10, n_jobs =-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# This will create a graphic that shows performance (accuracy) as a solid blue line for each feature added,
# and the faint blue band is the standard error for that feature count
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig1 = plot_sfs(sfs_forward.get_metric_dict(), kind='std_err', figsize=(10, 5))
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (Standard Error)')
plt.grid()
plt.show()

"""
    DISCUSSION
From the graph above, it appears that 7 features give the best model; after that, model performance plateaus again, as with Information Gain. We will re-run the model using the 7 best features.
"""

# Rerun with 7 features
sfs_forward = SFS(model,k_features=7,forward=True, verbose=1, scoring='accuracy',cv=10, n_jobs =-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# Get the 7 features used
            1, 64
        ),  # (1, 64): SFS will return whichever feature combination between min and max scored highest in cross-validation
        forward=FLAGS.FORWARD,  # forward or backward
        floating=FLAGS.FLOATING,  # put back?
        verbose=0,
        scoring='accuracy',  #'neg_mean_squared_error',
        cv=5)
    sfs = sfs.fit(X, y)

    best_feature_index = sfs.k_feature_idx_
    best_feature_name = [feature_names[i] for i in best_feature_index]
    print("The number of best features is:", len(best_feature_index))
    print("The best features' index are:", best_feature_index)
    print("The best features are:", best_feature_name)

    fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
    config = {(True,True):('FORWARD','FLOATING'),\
        (True,False):('FORWARD','NFLOATING'),\
        (False,True):('BACKWARD','FLOATING'),\
        (False,False):('BACKWARD','NFLOATING'),}
    fig.savefig('feature_selection/SINGLE-' +
                config[(FLAGS.FORWARD, FLAGS.FLOATING)][0] + '-' +
                config[(FLAGS.FORWARD, FLAGS.FLOATING)][1])

else:
    config = {(True,True):('FORWARD','FLOATING'),\
    (True,False):('FORWARD','NFLOATING'),\
    (False,True):('BACKWARD','FLOATING'),\
    (False,False):('BACKWARD','NFLOATING'),}

    best_feature_index_array = config.copy()
Example no. 24
Y = pd.read_csv('original_data/y_train.csv', names=['target'], delimiter=';')

estimator = ExtraTreesClassifier(criterion="gini", max_features=0.4, min_samples_split=6, n_estimators=100)

sfs1 = SFS(estimator,
           k_features=(10, 40),
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5,
           n_jobs=4)

sfs1 = sfs1.fit(X[X.columns].as_matrix(), Y['target'].values)

results = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

# (96, 97, 98, 131, 200, 138, 11, 76, 115, 83, 212, 182, 187, 156)
# 0.642879680873

# (96, 98, 131, 138, 11, 76, 43, 209, 115, 182, 29)
# 0.638581676053


print(sfs1.subsets_)
print(sfs1.k_feature_idx_)
print(sfs1.k_score_)
Example no. 25
def get_best_logisitc(y):

    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.cross_validation import StratifiedKFold
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import cross_val_score

    my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')

    y = my_data.target
    my_data = my_data.drop('target', axis=1)

    # To have better CV
    skf = StratifiedKFold(y, n_folds=5, random_state=17, shuffle=False)

    C_params = [0.01, 1, 10, 50, 70, 100]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']

    my_result_list = []
    for C_param in C_params:
        for solver in solvers:
            print "Looking for C : %s and solver : %s" % (C_param, solver)
            model = LogisticRegression(class_weight='balanced',
                                       random_state=17,
                                       solver=solver,
                                       C=C_param)
            sfs = SFS(model,
                      k_features=len(my_data.columns),
                      forward=True,
                      floating=False,
                      scoring='roc_auc',
                      print_progress=False,
                      cv=skf,
                      n_jobs=-1)

            sfs = sfs.fit(my_data.values, y.values)

            result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
            result_sfs.sort_values('avg_score', ascending=0, inplace=True)
            features_sfs = result_sfs.feature_idx.head(1).tolist()
            select_features_sfs = list(my_data.columns[features_sfs])

            scores = cross_val_score(model,
                                     my_data[select_features_sfs],
                                     y,
                                     cv=skf,
                                     scoring='roc_auc')
            my_result_list.append({
                'C': C_param,
                'solver': solver,
                'auc': scores.mean(),
                'std': scores.std(),
                'best_columns': select_features_sfs,
                'estimator': model
            })

    my_result = pd.DataFrame(my_result_list)
    my_result.sort_values('auc', ascending=0, inplace=True)

    best_features = my_result.best_columns.head(1).values[0]
    best_model = my_result.estimator.head(1).values[0]

    return best_features, best_model
Example no. 26
#print("Features selected in forward fit")
#print(x.columns[b])

#%% FORWARD FIT - Sequential Search (SFS)
sfs_f = SFS(
    lr,
    k_features=(1, predictors.shape[1]),
    forward=True,  # Forward fit
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5)

# Fit this on the data
sfs_f = sfs_f.fit(x.values, y.values)
# Get all the details of the forward fits
a = sfs_f.get_metric_dict()
n = []
o = []

# Compute the mean cross validation scores
for i in np.arange(1, predictors.shape[1]):
    n.append(-np.mean(a[i]['cv_scores']))
m = np.arange(1, predictors.shape[1])

# Plot the CV scores vs the number of features
fig1 = plt.plot(m, n, label='SFS_f')
plt.title('Mean CV Scores vs N# of features')
plt.xlabel('N# features')
plt.ylabel('MSE')

# Forward steps with Cross-Validation
Example no. 27
knn = KNeighborsClassifier(n_neighbors=4)
lr = LinearRegression()
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

lr = LinearRegression()

sfs = SFS(lr,
          k_features=13,
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)

sfs = sfs.fit(X, Y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

# In[148]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#Funded_amt_inv
#Term
#Grade
#Subgrade
#Dti
#Delinq_2_yrs
Example no. 28
def sequential_feature_selector(features, labels, classifier, k_features, kfold, selection_type, plot=True, **kwargs):
    """Sequential feature selection to reduce the number of features.

    The function reduces a d-dimensional feature space to a k-dimensional
    feature space by sequential feature selection. The features are selected
    using ``mlxtend.feature_selection.SequentialFeatureSelector`` which
    essentially selects or removes a feature from the d-dimensional input space
    until the preferred size is reached.

    The function will pass ``ftype='feature'`` and forward ``features`` on to a
    classifier's ``static_opts`` method.

    Args:
        features: The original d-dimensional feature space
        labels: corresponding labels
        classifier (str or object): The classifier which should be used for
            feature selection. This can be either a string (name of a classifier
            known to gumpy) or an instance of a classifier which adheres
            to the sklearn classifier interface.
        k_features (int): Number of features to select
        kfold (int): k-fold cross validation
        selection_type (str): One of ``SFS`` (Sequential Forward Selection),
            ``SBS`` (Sequential Backward Selection), ``SFFS`` (Sequential Forward
            Floating Selection), ``SBFS`` (Sequential Backward Floating Selection)
        plot (bool): Plot the results of the dimensionality reduction
        **kwargs: Additional keyword arguments that will be passed to the
            Classifier instantiation

    Returns:
        A 5-element tuple containing

        - **feature index**: Index of features in the remaining set
        - **cv_scores**: cross validation scores during classification
        - **algorithm**: Algorithm that was used for search
        - **sfs**: the fitted ``SequentialFeatureSelector`` object
        - **clf**: the underlying classifier that was used

    """

    # retrieve the appropriate classifier
    if isinstance(classifier, str):
        if not (classifier in available_classifiers):
            raise ClassifierError("Unknown classifier {c}".format(c=classifier.__repr__()))

        kwopts = kwargs.pop('opts', dict())
        # opts = dict()

        # retrieve the options that we need to forward to the classifier
        # TODO: should we forward all arguments to sequential_feature_selector ?
        opts = available_classifiers[classifier].static_opts('sequential_feature_selector', features=features)
        opts.update(kwopts)

        # XXX: now merged into the static_opts invocation. TODO: test
        # if classifier == 'SVM':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'RandomForest':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'MLP':
        #     # TODO: check if the dimensions are correct here
        #     opts['hidden_layer_sizes'] = (features.shape[1], features.shape[2])
        # get all additional entries for the options
        # opts.update(kwopts)

        # retrieve a classifier object
        classifier_obj = available_classifiers[classifier](**opts)

        # extract the backend classifier
        clf = classifier_obj.clf
    else:
        # if we received a classifier object we'll just use this one
        clf = classifier.clf


    if selection_type == 'SFS':
        algorithm = "Sequential Forward Selection (SFS)"
        sfs = SFS(clf, k_features, forward=True, floating=False,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBS':
        algorithm = "Sequential Backward Selection (SBS)"
        sfs = SFS(clf, k_features, forward=False, floating=False,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SFFS':
        algorithm = "Sequential Forward Floating Selection (SFFS)"
        sfs = SFS(clf, k_features, forward=True, floating=True,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBFS':
        algorithm = "Sequential Backward Floating Selection (SBFS)"
        sfs = SFS(clf, k_features, forward=False, floating=True,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    else:
        raise Exception("Unknown selection type '{}'".format(selection_type))


    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(features, labels)
    subsets = sfs.subsets_
    feature_idx = sfs.k_feature_idx_
    cv_scores = sfs.k_score_

    if plot:
        fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
        plt.ylim([0.5, 1])
        plt.title(algorithm)
        plt.grid()
        plt.show()

    return feature_idx, cv_scores, algorithm, sfs, clf
Example no. 29
logreg = linear_model.LogisticRegression()

sfs = SFS(logreg, 
          k_features=30, 
          forward=True, 
          floating=False, 
          scoring='roc_auc',
          cv=4)

sfs = sfs.fit(X, y)
print('\nSequential Forward Selection (k=30):')
print(sfs.k_feature_idx_)
print('CV Score:')
print(sfs.k_score_)

pd.DataFrame.from_dict(sfs.get_metric_dict()).T

plt.figure(figsize=(19,10))
fig = plot_sfs(sfs.get_metric_dict(), kind=None)
plt.title('Sequential Forward Selection (rocauc)')
plt.grid()
plt.show()


# In[7]:

idxs_selected=sfs.k_feature_idx_
featureindex = []
for i in idxs_selected:
    featureindex.append(i)
featuredataframe=df.iloc[:,1:134]
Example no. 30
        dic[i] = rfe.score()
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(dic.keys(), dic.values())
    plt.show()
    return dic


if __name__ == "__main__":
    train_data = load_data(train_url)
    train_y = train_data['price']
    train_data.drop(['SaleID'], axis=1, inplace=True)
    train_data.drop(['price'], axis=1, inplace=True)
    col_name = [
        'name', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
        'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_3', 'v_12',
        'usedTime'
    ]
    sfs = SFS(LinearRegression(),
              k_features=13,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    train_data = train_data.fillna(0)
    sfs.fit(train_data, train_y)
    print(sfs.k_feature_names_)
    print(pd.DataFrame.from_dict(sfs.get_metric_dict()).T)
    fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
    plt.grid()
    plt.show()
    df_sim = comp.load_dataframe(datatype='sim', config=args.config)

    # Extract training features and targets from simulation DataFrame
    feature_list, feature_labels = comp.get_training_features()
    X_train_sim, X_test_sim, y_train_sim, y_test_sim, le = comp.get_train_test_sets(
        df_sim, feature_list, comp_class=True)

    # Load pipeline to use
    pipeline = comp.get_pipeline(args.pipeline)

    k_features = X_train_sim.shape[1] if args.method == 'forward' else 1

    # Set up sequential feature selection algorithm
    sfs = SFS(pipeline,
              k_features=k_features,
              forward=True if args.method == 'forward' else False,
              floating=args.floating,
              scoring=args.scoring,
              print_progress=True,
              cv=args.cv,
              n_jobs=args.n_jobs)
    # Run algorithm
    sfs = sfs.fit(X_train_sim, y_train_sim)

    # Get DataFrame of sfs results
    results_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
    # Save DataFrame to csv file
    output_file = 'SFS-results/{}_{}_{}_{}_cv{}.csv'.format(args.pipeline, args.method,
        'floating' if args.floating else 'nofloat', args.scoring, args.cv)
    results_df.to_csv(output_file)