def select_top_features(train_data):
    """Identify zero- and low-importance features of a training set.

    Args:
        train_data: Indexable pair; element 0 is handed to
            ``FeatureSelector`` as the data and element 1 as the labels.

    Returns:
        A 2-tuple ``(zero_importance, low_importance)`` taken from the
        selector's ``ops`` mapping after both identification passes.
    """
    features, labels = train_data[0], train_data[1]
    selector = FeatureSelector(features, labels)

    # The low-importance pass is run after the zero-importance pass,
    # mirroring the order the selector is driven in.
    importance_options = {
        'task': 'classification',
        'eval_metric': 'auc',
        'n_iterations': 6,
        'early_stopping': True,
    }
    selector.identify_zero_importance(**importance_options)
    selector.identify_low_importance(cumulative_importance=0.99)

    flagged = selector.ops
    return flagged['zero_importance'], flagged['low_importance']
# Example no. 2
# 0
def featureselect(datas, target):
    """Run every FeatureSelector check and return the reduced data.

    Args:
        datas: Feature data passed to ``FeatureSelector`` as ``data``.
        target: Labels passed to ``FeatureSelector`` as ``labels``.

    Returns:
        Whatever ``FeatureSelector.remove(methods='all')`` returns.
    """
    import os

    # NOTE(review): this permanently changes the process working directory
    # to a hard-coded location before importing feature_selector —
    # presumably so the module resolves from that folder; confirm and
    # consider a proper install or sys.path entry instead.
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    selector = FeatureSelector(data=datas, labels=target)

    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.9)

    importance_options = dict(task='classification',
                              eval_metric='auc',
                              n_iterations=10,
                              early_stopping=False)
    selector.identify_zero_importance(**importance_options)
    selector.identify_low_importance(cumulative_importance=0.9)

    return selector.remove(methods='all')
# Example no. 3
# 0
def Bestfeature_from_cummulative_importance(inFile, outFile):
    """Drop zero-importance features from a tab-separated dataset.

    Reads ``inFile`` (tab-separated, with a ``class_label`` column), runs
    the FeatureSelector zero-importance and low-importance passes, removes
    only the zero-importance features, re-attaches the labels and writes
    the result to ``outFile`` as tab-separated text without the index.

    The input shape and the first position at which the selector's
    ``cumulative_importance`` column exceeds 0.99 are printed.

    Args:
        inFile: Path (or buffer) readable by ``pandas.read_csv``.
        outFile: Destination accepted by ``DataFrame.to_csv``.
    """
    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])

    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)

    # First positional index whose cumulative importance passes 0.99.
    # Computed before the low-importance pass and printed after it, to keep
    # the original output order.
    above_threshold = fs.feature_importances['cumulative_importance'] > 0.99
    importance_index = np.min(np.where(above_threshold))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)

    # Only the zero-importance features are removed; the low-importance
    # result is recorded on the selector but not applied here.
    train_removed_all = fs.remove(methods=['zero_importance'],
                                  keep_one_hot=False)
    train_removed_all = pd.concat([train_removed_all, train_labels], axis=1)
    # index=False is the documented way to omit the index column.
    train_removed_all.to_csv(outFile, sep='\t', index=False)
# Example no. 4
# 0
    def runFeatureSelector(self, df):
        """Run all FeatureSelector checks on ``df`` and dump their records.

        Each identification step is followed by a CSV export of the
        matching record under ``.\\utils\\csv``; a summary of ``fs.ops`` is
        written last.

        Args:
            df: Data passed to ``FeatureSelector`` as ``data``; labels come
                from ``self.targets``.

        Returns:
            The result of ``fs.remove(methods='all')`` when ``self.drop``
            equals 1, otherwise ``df`` unchanged.
        """
        logging.info("Running Feature Selection")
        selector = FeatureSelector(data=df, labels=self.targets)

        # Missing values
        selector.identify_missing(missing_threshold=0.6)

        # Collinear features
        selector.identify_collinear(correlation_threshold=0.98)
        selector.record_collinear.to_csv(
            ".\\utils\\csv\\record_collinear.csv")

        # Features with a single unique value
        selector.identify_single_unique()
        selector.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Zero-importance features
        importance_options = dict(task='classification',
                                  eval_metric='multi_logloss',
                                  n_iterations=10,
                                  early_stopping=True)
        selector.identify_zero_importance(**importance_options)
        selector.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Low-importance features
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.feature_importances.to_csv(
            ".\\utils\\csv\\feature_importance.csv")

        # Summary of every operation's flagged features
        ops_summary = pd.DataFrame.from_dict(selector.ops, orient='index')
        ops_summary.to_csv(".\\utils\\csv\\summary.csv")

        # Only drop the suggested features when the drop flag is 1
        if self.drop == 1:
            return selector.remove(methods='all')
        return df
# Example no. 5
# 0
    def remove_unnecessary_features(self, auto=False):
        """Prune columns from ``self.processed_data`` in place.

        Args:
            auto: When truthy, simply drop the columns listed in
                ``self.predefined_skip_features``. Otherwise run every
                FeatureSelector check against the data (with the ``label``
                column split off as labels), remove all flagged features,
                and put the ``label`` column back.
        """
        if auto:
            self.processed_data = self.processed_data.drop(
                columns=self.predefined_skip_features)
            return

        label_column = self.processed_data["label"]
        selector = FeatureSelector(
            data=self.processed_data.drop("label", axis=1),
            labels=label_column)

        selector.identify_missing(missing_threshold=0.6)
        selector.identify_collinear(correlation_threshold=0.98)
        selector.identify_zero_importance(task='classification',
                                          eval_metric='auc',
                                          n_iterations=10,
                                          early_stopping=False)
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.identify_single_unique()

        # remove() returns a new frame without the labels, so re-attach them
        reduced = selector.remove(methods='all')
        self.processed_data = reduced
        self.processed_data["label"] = label_column
# Example no. 6
# 0
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce a labelled CSV to its useful features and save the result.

    The input is first passed through ``strip_header``; the file it returns
    is read as CSV, the ``Label`` column is used as the target, and the
    single-unique, collinear, zero-importance and low-importance features
    are removed. The reduced data (with ``Label`` re-attached) is written
    to ``saveto_path`` as CSV, preceded by one metadata line of the form
    ``<row count>,<feature count>``. The file returned by ``strip_header``
    is deleted afterwards.

    Args:
        data_file_path: Path of the source data file.
        saveto_path: Output path. The sentinel ``"Default"`` derives the
            path from ``data_file_path`` via
            ``replace_ext(data_file_path, '_reduced.csv')``.
    """
    mod_data_file_path = strip_header(data_file_path)

    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(
        methods=[
            'single_unique', 'collinear', 'zero_importance', 'low_importance'
        ],
        keep_one_hot=False)
    X_dash['Label'] = y

    X_dash.to_csv(saveto_path, index=False)

    # Prepend "<rows>,<features>" (the Label column is not counted as a
    # feature) by rewriting the file that was just saved.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    with open(saveto_path, 'w') as fh:
        fh.write(','.join(meta_data) + '\n' + contents)

    # Delete the intermediate file directly rather than through a
    # string-built shell command: this is portable and safe for paths that
    # contain spaces or shell metacharacters. Like the former `rm -f`,
    # cleanup is best-effort and never fails the call.
    try:
        os.remove(mod_data_file_path)
    except OSError:
        pass
# Example no. 7
# 0
# Script fragment: inspects the results of an earlier zero-importance pass,
# runs the low-importance pass, and removes flagged features. `fs`, `plt` and
# `zero_importance_features` are defined outside this excerpt.

# NOTE(review): the slice starts at index 1, so the first entry is skipped —
# confirm this is intended.
print(zero_importance_features[1:15])

fs.plot_feature_importances(threshold=0.99, plot_n=12)
plt.show()

print(fs.feature_importances.head(10))

# Label-based slice up to and including label 99; presumably the frame has a
# default integer index so this yields the first 100 features — confirm.
one_hundred_features = list(fs.feature_importances.loc[:99, 'feature'])
# Bare expression: only displays a value in an interactive session.
len(one_hundred_features)

#    5. Low Importance Features

# When using this method, we must have already run identify_zero_importance and
# need to pass in a cumulative_importance that accounts for that fraction of total feature importance.

fs.identify_low_importance(cumulative_importance=0.99)

low_importance_features = fs.ops['low_importance']
print(low_importance_features[:5])

fs.plot_feature_importances(threshold=0.99, plot_n=12)
plt.show()

# 6   Removing Features

# Removing Features:    This method returns the resulting data which we can then use for machine learning.
#                       The original data will still be accessible in the data attribute of the Feature Selector.

train_no_missing = fs.remove(methods=['missing'])  # 17 features identified
train_no_missing_zero = fs.remove(methods=['missing',
                                           'zero_importance'])  # 66 + 17 = 83 features identified so far
# Example no. 8
# 0
# Notebook fragment. `identify_feat_imp`, `fsDataScale`, `fc` and `np` are
# defined outside this excerpt.

# Attach the externally defined identify_feat_imp function to the
# FeatureSelector class so that instances can call it as a method.
FeatureSelector.identify_feat_imp=identify_feat_imp


# In[28]:


fsDataScale.identify_feat_imp()


# NOTE(review): the original note here read "zero feath with 0" — presumably
# "zero features with 0 importance"; confirm.

# In[29]:


# Low-importance pass with 0.95 passed positionally.
fsDataScale.identify_low_importance(0.95)


# In[30]:


fsDataScale.plot_feature_importances(plot_n=300)


# In[31]:


# Bare expression: only displays a value in an interactive session.
max(fsDataScale.scores)


# In[32]:
# From here on a different selector object, `fc`, is used.
fc.missing_stats.head()

# In[12]:

fc.identify_collinear(correlation_threshold=0.98)

# In[17]:

fc.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=True)

# In[18]:

fc.identify_low_importance(cumulative_importance=0.95)

# In[19]:

fc.identify_single_unique()

# In[30]:

# Remove the features flagged by every method that has been run.
train_removed = fc.remove(methods='all')

# In[31]:

# Persist only the underlying values (column names are not saved);
# np.save appends ".npy" to the given file name.
np.save('train_removed2', train_removed.values)

# In[29]: