def select_features_without_label(
    features: pd.DataFrame,
    missing_threshold=0.7,
    correlation_threshold=0.95,
) -> pd.DataFrame:
    """Screen ``features`` with the label-free ``FeatureSelector`` checks.

    Only the 'missing' and 'single_unique' checks are run, so no labels are
    required.

    Args:
        features: Feature table handed to ``FeatureSelector`` as ``data``.
        missing_threshold: Forwarded to ``identify_missing``.
        correlation_threshold: Accepted for interface compatibility but not
            used; the collinearity check is not run by this function.

    Returns:
        Whatever ``FeatureSelector.remove`` returns for the 'missing' and
        'single_unique' methods.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()
    # NOTE: identify_collinear(correlation_threshold) is disabled here, which
    # is why 'collinear' is absent from the removal list below.
    applied_methods = ['missing', 'single_unique']
    return selector.remove(methods=applied_methods)
def select_features_without_label(
    features: pd.DataFrame,
    missing_threshold=0.90,
    correlation_threshold=1,
) -> pd.DataFrame:
    """Screen ``features`` with the label-free ``FeatureSelector`` checks.

    The 'missing' and 'single_unique' checks always run. The 'collinear'
    check is added only when ``correlation_threshold`` is below 1.

    Args:
        features: Feature table handed to ``FeatureSelector`` as ``data``.
        missing_threshold: Forwarded to ``identify_missing``; also printed.
        correlation_threshold: Forwarded to ``identify_collinear`` when it is
            strictly less than 1; otherwise the collinearity check is skipped.

    Returns:
        Whatever ``FeatureSelector.remove`` returns for the methods that ran.
    """
    print(missing_threshold)

    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    # Build the removal list incrementally instead of duplicating the return.
    applied_methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        applied_methods.append("collinear")

    return selector.remove(methods=applied_methods)
def featureselect(datas, target):
    """Run a fixed ``FeatureSelector`` pipeline and return the reduced data.

    Runs the missing, collinear, zero-importance and low-importance checks
    with hard-coded settings, then removes every flagged feature.

    Args:
        datas: Feature table handed to ``FeatureSelector`` as ``data``.
        target: Labels handed to ``FeatureSelector`` as ``labels``.

    Returns:
        The result of ``FeatureSelector.remove(methods='all')``.
    """
    import os

    # NOTE(review): the working directory is switched to a hard-coded,
    # machine-specific path before the import below, and is not restored.
    # Presumably this is where feature_selector.py lives; confirm.
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    selector = FeatureSelector(data=datas, labels=target)
    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.9)

    importance_options = dict(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=False,
    )
    selector.identify_zero_importance(**importance_options)
    selector.identify_low_importance(cumulative_importance=0.9)

    return selector.remove(methods='all')
def runFeatureSelector(self, df):
    """Run the ``FeatureSelector`` checks on ``df`` and export their records.

    Each check's record table, the feature importances and a summary of
    ``fs.ops`` are written to the CSV paths hard-coded below.

    Args:
        df: Feature table handed to ``FeatureSelector`` as ``data``; labels
            come from ``self.targets``.

    Returns:
        The result of ``FeatureSelector.remove(methods='all')`` when
        ``self.drop == 1``; otherwise ``df`` unchanged.
    """
    logging.info("Running Feature Selection")
    selector = FeatureSelector(data=df, labels=self.targets)

    # NOTE(review): the output paths use Windows-style separators and assume
    # the target directory already exists relative to the working directory.

    # Missing values (no record is exported for this check)
    selector.identify_missing(missing_threshold=0.6)

    # Collinearity
    selector.identify_collinear(correlation_threshold=0.98)
    selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Single unique value
    selector.identify_single_unique()
    selector.record_single_unique.to_csv(
        ".\\utils\\csv\\record_single_unique.csv")

    # Zero importance
    selector.identify_zero_importance(
        task='classification',
        eval_metric='multi_logloss',
        n_iterations=10,
        early_stopping=True,
    )
    selector.record_zero_importance.to_csv(
        ".\\utils\\csv\\record_zero_importance.csv")

    # Low importance
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.feature_importances.to_csv(
        ".\\utils\\csv\\feature_importance.csv")

    # Summary of all operations: one row per entry of selector.ops
    ops_summary = pd.DataFrame.from_dict(selector.ops, orient='index')
    ops_summary.to_csv(".\\utils\\csv\\summary.csv")

    # Only drop the suggested features when the drop flag is exactly 1
    if self.drop == 1:
        return selector.remove(methods='all')
    return df
def remove_unnecessary_features(self, auto=False):
    """Prune columns of ``self.processed_data`` in place.

    Args:
        auto: When true, simply drop ``self.predefined_skip_features``.
            Otherwise run the ``FeatureSelector`` checks against the
            "label" column and remove every flagged feature.

    Returns:
        None. ``self.processed_data`` is reassigned.
    """
    if auto:
        self.processed_data = self.processed_data.drop(
            columns=self.predefined_skip_features)
        return

    # Keep the label column aside: the selector only sees the features.
    labels = self.processed_data["label"]
    selector = FeatureSelector(
        data=self.processed_data.drop(columns="label"),
        labels=labels,
    )

    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.98)
    selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=False,
    )
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.identify_single_unique()

    # Remove the features flagged by all methods, then restore the labels.
    self.processed_data = selector.remove(methods='all')
    self.processed_data["label"] = labels
#-- Create the selector over the feature table and its labels
fs = FeatureSelector(data=df_feats, labels=train_labels)

#-- Identify redundant features
# NOTE(review): the extent of the else-branch was reconstructed from
# whitespace-collapsed text; confirm it against the original layout.
if USE_LEARNER_FOR_FEATURE_SELECTION:
    # NOT COMPLETE
    fs.identify_all(
        selection_params={
            'missing_threshold': 0.6,
            'correlation_threshold': 0.98,
            'task': 'classification',
            'eval_metric': 'auc',
            'cumulative_importance': 0.99,
        }
    )
    #-- Keep the valuable features
    X = fs.remove(methods='all', keep_one_hot=True)
else:
    #-- Features with missing values greater than threshold
    fs.identify_missing(missing_threshold=MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold=CORRELATION_THRESHOLD)
    #-- Single unique value
    fs.identify_single_unique()

    #-- The names of the checks that ran are the keys of fs.ops
    fs.ops.keys()

    missing_features = list(fs.ops['missing'])
    corelated_features = list(fs.ops['collinear'])
    single_value = list(fs.ops['single_unique'])

    #-- Union of every feature flagged by the three checks above
    r = set(flatten([missing_features, corelated_features, single_value]))
    #X = df_feats.drop(r, axis=1)
# DataFrame.head() shows the first five rows by default; train is already a
# DataFrame, so .head() can be called on it directly.
print(train.head())

# Remove the TARGET column with DataFrame.drop.
train = train.drop(columns=['TARGET'])
# In pandas, row labels are the index and column labels are the columns, e.g.
# df = pd.DataFrame(np.random.randn(5,3), index=list('abcde'), columns=['one','two','three'])

# Create the instance
fs = FeatureSelector(data=train, labels=train_labels)

# 1 Missing Values
fs.identify_missing(missing_threshold=0.6)
# The features identified for removal can be accessed through the ops
# dictionary of the FeatureSelector object.
missing_features = fs.ops['missing']
print(missing_features[:20])

fs.plot_missing()
# Call plt.show() after each plotting call so the figure is displayed.
plt.show()

print(fs.missing_stats.head(20))

# 2 Single Unique Value
fs.identify_single_unique()
single_unique = fs.ops['single_unique']
print(single_unique)
# Load the test and train tables from the directory prefix path0.
test = pd.read_csv(path0 + 'test.csv')
train = pd.read_csv(path0 + 'train.csv')

print('tag_value_counts', train['Tag'].value_counts())

# Split the labels off, and drop the UID column from both tables.
train_labels = train['Tag']
train = train.drop(columns=['UID', 'Tag'])
X_loc_test = test.drop(columns='UID')

from feature_selector import FeatureSelector

# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)

# Missing-value statistics
fs.identify_missing(missing_threshold=0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction', ascending=False)
print('df_miss_value', df_miss_value.head(15))

missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])

# Single-unique-value feature statistics
fs.identify_single_unique()
# The plot call's return value is printed after the label.
print('fs.plot_unique()', fs.plot_unique())

fs.identify_collinear(correlation_threshold=0.95)
print('plot_collinear()', fs.plot_collinear())

# list of collinear features to remove
collinear_features = fs.ops['collinear']
# Evaluate the K-Means clustering results
data_new2["Is_Male"] = y_pred_kmeans
# Take an explicit copy of the column selection: assigning a new column to a
# bare selection of data_new2 triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous which frame is being modified.
test = data_new2[important_features].copy()
test["Is_Male"] = y_pred_kmeans

# Building ANN now
data_ANN = data.copy()
data_ANN["Is_Male"] = y_pred_kmeans
data_ANN.drop(columns="customer_id", inplace=True)

# NOTE(review): the split is positional. It assumes the first 42 columns are
# the features and the column at position 42 is the target; confirm.
X = data_ANN.iloc[:, :42]
y = data_ANN.iloc[:, 42]

# Step 1 - Feature selection
from feature_selector import FeatureSelector

fts = FeatureSelector(X, y)
fts.identify_missing(missing_threshold=0.9)

fts.identify_collinear(correlation_threshold=0.7)
fts.plot_collinear()
collinear_features = fts.ops['collinear']

fts.identify_zero_importance(task='classification',
                             eval_metric='auc',
                             n_iterations=30,
                             early_stopping=True)
zero_importance_features = fts.ops['zero_importance']
fts.plot_feature_importances(threshold=0.99, plot_n=12)

# Keep the first 28 entries of the feature-importance table.
# NOTE(review): this assumes every one of those names is a column of
# data_ANN; verify.
Most_important_Features = list(fts.feature_importances["feature"].head(28))
Data_ANN_2 = data_ANN[Most_important_Features]
# In[7]:
from feature_selector import FeatureSelector

# In[8]:
# Wrap the raw array in a DataFrame so FeatureSelector can consume it.
data = pd.DataFrame(data=data_np)

# In[27]:
# Selector over the feature table and its labels.
fc = FeatureSelector(data=data, labels=label)

# In[10]:
# Missing-value check
fc.identify_missing(missing_threshold=0.95)

# In[11]:
# Bare expression: shown as cell output in a notebook, no effect in a script.
fc.missing_stats.head()

# In[12]:
# Collinearity check
fc.identify_collinear(correlation_threshold=0.98)

# In[17]:
# Zero-importance check
fc.identify_zero_importance(
    task='classification',
    eval_metric='auc',
    n_iterations=10,
    early_stopping=True,
)