def prepare_data():
    """Read, clean and split the data into training/testing X, Y sets.

    Main entry point of this module: reads the training/testing CSV files,
    removes collinear features (correlation > 0.85) identified on the
    training set, shuffles both frames, and delegates nominal-feature
    encoding to ``clean_nominals_and_create_our_datasets``.

    Returns:
        train_x, train_y, test_x, test_y, labels
    """
    # Read our csv files; the 'id' column is a row identifier, not a feature.
    training_df = pd.read_csv("training.csv").drop("id", axis=1)
    testing_df = pd.read_csv("testing.csv").drop("id", axis=1)
    # NOTE: the original code also read "UNSW_NB15_features.csv" into a local
    # variable that was never used; that dead read has been removed.

    # Drop one feature from every highly-correlated pair (fit on training only).
    fs = FeatureSelector(data=training_df)
    fs.identify_collinear(correlation_threshold=0.85)
    training_df = fs.remove(methods=['collinear'], keep_one_hot=True)

    # Keep the testing set aligned with the surviving training columns.
    columnlist = list(training_df)
    testing_df = testing_df[columnlist]

    # Shuffle both frames (sample(frac=1) returns a row-permuted copy).
    training_df = training_df.sample(frac=1)
    testing_df = testing_df.sample(frac=1)

    train_x, train_y, test_x, test_y, labels = clean_nominals_and_create_our_datasets(training_df, testing_df)

    # Drop the targets only so the printed feature list excludes them.
    training_df = training_df.drop(["attack_cat", "label"], axis=1)
    print("The features we will use are: ", np.array(list(training_df)))
    return train_x, train_y, test_x, test_y, labels
def test():
    """Smoke-test FeatureSelector's collinearity detection on menori.xlsx."""
    frame = pd.read_excel('../data/menori.xlsx')
    target = frame["nums"]
    print(frame.head())
    # The label column must not take part in feature-vs-feature correlation.
    frame = frame.drop(columns=["nums"])
    selector = FeatureSelector(data=frame, labels=target)
    selector.identify_collinear(correlation_threshold=0.98)
    print(selector.ops['collinear'])
def select_features_without_label(features: pd.DataFrame, missing_threshold=0.99, correlation_threshold=1.0) -> pd.DataFrame:
    """Drop mostly-missing, single-valued and (optionally) collinear columns.

    Collinearity is only checked when ``correlation_threshold`` is below 1.0;
    with the default of 1.0 that step is skipped entirely.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()
    methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        methods.append("collinear")
    return selector.remove(methods=methods)
def transform_to_nominal():
    """Rebuild the post-selection column list and nominal-class metadata.

    Mirrors the feature selection performed on the training CSV so that the
    returned column list matches what the training pipeline produces.

    Returns:
        labels, nominal_cols, columnList
    """
    # Load the training data, discarding the uninformative id column.
    df = pd.read_csv("training.csv").drop("id", axis=1)

    # Remove collinear features exactly as the training pipeline does.
    selector = FeatureSelector(data=df)
    selector.identify_collinear(correlation_threshold=0.85)
    df = selector.remove(methods=['collinear'], keep_one_hot=True)

    df = df.sample(frac=1)  # shuffle rows
    df = df.drop(["attack_cat", "label"], axis=1)  # targets are not features
    columnList = list(df)
    labels, nominal_cols = retrieve_classes(df)
    return labels, nominal_cols, columnList
def featureselect(datas, target):
    """Run the full FeatureSelector pipeline and return the reduced frame.

    Args:
        datas: feature DataFrame.
        target: label Series aligned with ``datas``.

    Returns:
        DataFrame with missing, collinear, zero-importance and low-importance
        features removed.
    """
    import sys
    # Make the local feature_selector module importable. The original code
    # used os.chdir() here, which mutates the process-wide working directory
    # and silently breaks every other relative path in the program; extending
    # sys.path gives the same import effect without that global side effect.
    module_dir = 'c:\\Users\\SA\\python\\練習py'
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)
    from feature_selector import FeatureSelector

    fs = FeatureSelector(data=datas, labels=target)
    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification', eval_metric='auc',
                                n_iterations=10, early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)
    train_removed = fs.remove(methods='all')
    return train_removed
def runFeatureSelector(self, df):
    """Run all five FeatureSelector analyses on ``df``, dump each report to
    CSV under .\\utils\\csv, and return ``df`` — with the flagged features
    removed when ``self.drop == 1``, unchanged otherwise."""
    logging.info("Running Feature Selection")
    selector = FeatureSelector(data=df, labels=self.targets)

    # Features with more than 60% missing values.
    selector.identify_missing(missing_threshold=0.6)

    # Highly correlated feature pairs.
    selector.identify_collinear(correlation_threshold=0.98)
    selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Features holding a single unique value.
    selector.identify_single_unique()
    selector.record_single_unique.to_csv(".\\utils\\csv\\record_single_unique.csv")

    # Features with zero importance for the classification task.
    selector.identify_zero_importance(task='classification',
                                      eval_metric='multi_logloss',
                                      n_iterations=10, early_stopping=True)
    selector.record_zero_importance.to_csv(".\\utils\\csv\\record_zero_importance.csv")

    # Features outside the top 99% of cumulative importance.
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

    # Persist a summary of every operation for offline review.
    report = pd.DataFrame.from_dict(selector.ops, orient='index')
    report.to_csv(".\\utils\\csv\\summary.csv")

    # Only drop the suggested features when explicitly configured to.
    if self.drop == 1:
        df = selector.remove(methods='all')
    return df
def remove_unnecessary_features(self, auto=False):
    """Prune ``self.processed_data`` in place.

    With ``auto=True`` simply drops the predefined skip list; otherwise runs
    the full FeatureSelector pipeline on the features (label excluded) and
    keeps only the surviving columns, re-attaching the label afterwards.
    """
    if auto:
        # Fast path: drop a known feature list without any analysis.
        self.processed_data = self.processed_data.drop(
            columns=self.predefined_skip_features)
        return

    selector = FeatureSelector(
        data=self.processed_data.drop("label", axis=1),
        labels=self.processed_data["label"])
    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.98)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='auc',
                                      n_iterations=10,
                                      early_stopping=False)
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.identify_single_unique()

    # remove() works on the label-free frame, so re-attach the label column.
    label_column = self.processed_data["label"]
    self.processed_data = selector.remove(methods='all')
    self.processed_data["label"] = label_column
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce a labelled CSV to its most useful features and save the result.

    Strips the header via ``strip_header``, runs FeatureSelector (single
    unique, collinear, zero importance, low importance), writes the reduced
    data to ``saveto_path`` with a ``rows,feature_count`` metadata line
    prepended, and deletes the temporary header-stripped file.

    Args:
        data_file_path: path to the input CSV (must contain a 'Label' column).
        saveto_path: output path; "Default" derives '<input>_reduced.csv'.
    """
    mod_data_file_path = strip_header(data_file_path)
    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    feature_selector.identify_low_importance(cumulative_importance=0.99)
    # NOTE: the original also captured one_hot_features/base_features into
    # locals that were never used; those dead reads have been removed.
    X_dash = feature_selector.remove(methods=[
        'single_unique', 'collinear', 'zero_importance', 'low_importance'
    ], keep_one_hot=False)

    X_dash['Label'] = y
    X_dash.to_csv(saveto_path, index=False)

    # Prepend "row_count,feature_count" as a metadata line.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    # Remove the temporary file. os.remove replaces the original
    # os.system("rm -f " + path): portable, and no string-built shell command.
    try:
        os.remove(mod_data_file_path)
    except OSError:
        # Mirror "rm -f": ignore a missing temp file.
        pass
# Features are in train and labels are in train_labels fs = FeatureSelector(data=train, labels=train_labels) #缺失值统计 fs.identify_missing(0.5) df_miss_value = fs.missing_stats.sort_values('missing_fraction', ascending=False) print('df_miss_value', df_miss_value.head(15)) missing_features = fs.ops['missing'] print('missing_features to remove', missing_features[:20]) #单值特征统计 fs.identify_single_unique() print('fs.plot_unique()', fs.plot_unique()) fs.identify_collinear(0.95) print('plot_collinear()', fs.plot_collinear()) # list of collinear features to remove collinear_features = fs.ops['collinear'] print('collinear_features', collinear_features) # dataframe of collinear features df_collinear_features = fs.record_collinear.sort_values('corr_value', ascending=False) print('df_collinear_features', df_collinear_features.head(50)) #零重要度特征统计 # Pass in the appropriate parameters fs.identify_zero_importance(task='classification', eval_metric=tpr_weight_funtion_lc,
# NOTE(review): fragment — the call these first tokens close (presumably a
# train_test_split) begins outside this chunk; left untouched.
test_size=0.2, random_state=0)
# Feature scaling: fit the scaler on the training set only, then apply the
# same transform to the test set to avoid leakage.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)
# Feature selection (remove highly correlated features)
from feature_selector import FeatureSelector
n = len(X_train.T)  # feature count before selection
fs = FeatureSelector(data=X_train)
fs.identify_collinear(
    correlation_threshold=0.7)  # select features from training set
corr = fs.ops['collinear']
X_train = fs.remove(methods=['collinear'
                             ])  # remove selected features from training set
to_remove = pd.unique(
    fs.record_collinear['drop_feature'])  # features to remove
X_test = X_test.drop(
    columns=to_remove)  # remove selected features from test set
# Create the artificial neural network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# Input-layer width equals the number of surviving features.
num_input_nodes = len(X_train.T)
from feature_selector import FeatureSelector # In[8]: train_labels = train_data['label'] train_features = train_data.drop(columns=[ 'user', 'product_nbr', 'last_year_capture_user_flag', 'label', 'pro_brand_-1', 'pro_brand_Apple', 'pro_brand_三星', 'pro_brand_其他', 'pro_brand_华为', 'pro_brand_小米', 'pro_brand_未知厂商', 'pro_brand_欧珀', 'pro_brand_维沃' ]) # In[14]: fs = FeatureSelector(data=train_features, labels=train_labels) fs.identify_collinear(correlation_threshold=0.9, one_hot=False) # 绘制选择的特征的相关性heatmap fs.plot_collinear() # 列出要删除的共线特征 collinear_features = fs.ops['collinear'] # 查看共线特征的dataframe fs.record_collinear # In[20]: train_data = train_data.drop(columns=collinear_features) # In[21]: train_data.shape
def featureSelect(x, y):
    """Return the list of features flagged as collinear (threshold 0.99)."""
    selector = FeatureSelector(data=x, labels=y)
    selector.identify_collinear(correlation_threshold=0.99)
    return selector.ops['collinear']
# NOTE(review): fragment — depends on `data_new2`, `important_features`,
# `y_pred_kmeans` and `data` from earlier cells.
test = data_new2[important_features]
test["Is_Male"] = y_pred_kmeans
# Building ANN now
data_ANN = data.copy()
data_ANN["Is_Male"] = y_pred_kmeans
data_ANN.drop(columns="customer_id", inplace=True)
# Assumes the first 42 columns are features and column index 42 is the
# appended Is_Male column — TODO confirm against the actual frame width.
X = data_ANN.iloc[:, :42]
y = data_ANN.iloc[:, 42]
# Step 1 - Feature selection
from feature_selector import FeatureSelector
fts = FeatureSelector(X, y)
fts.identify_missing(missing_threshold=0.9)
fts.identify_collinear(correlation_threshold=0.7)
fts.plot_collinear()
collinear_features = fts.ops['collinear']
fts.identify_zero_importance(task='classification', eval_metric='auc',
                             n_iterations=30, early_stopping=True)
zero_importance_features = fts.ops['zero_importance']
fts.plot_feature_importances(threshold=0.99, plot_n=12)
# Keep the 28 most important features for the ANN input.
Most_important_Features = list(fts.feature_importances["feature"].head(28))
Data_ANN_2 = data_ANN[Most_important_Features]
X = Data_ANN_2.iloc[:, :]
y = data_ANN.iloc[:, 42]
In this notebook we will test all the five function, to start we need to create an instance.

***Examples on how to use Feature Selector are in the end of this notebook***
"""
# NOTE(review): this fragment begins mid-string — the opening quotes of the
# markdown cell above are outside this chunk. The bare triple-quoted strings
# here are notebook markdown cells exported as expression statements.
fs = FeatureSelector(data = df, labels = df_label)
"""###Removing Features
***Once we have identified the features to remove, we have a number of ways to drop the features. We can access any of the feature lists in the removal_ops dictionary and remove the columns manually. We also can use the remove method, passing in the methods that identified the features we want to remove***
Now, we will apply collinear, zero importance, low importance and sigle unique feature importance to selecti which columns to remove
"""
fs.identify_collinear(correlation_threshold=0.975)
fs.identify_zero_importance(task = 'classification', eval_metric = 'auc',
                            n_iterations = 10, early_stopping = True)
fs.identify_low_importance(cumulative_importance = 0.99)
fs.identify_single_unique()
# check_removal() lists every feature flagged by any of the runs above.
to_remove = fs.check_removal()
feature_df = fs.remove(
    methods = ['collinear', 'zero_importance', 'low_importance', 'single_unique'],
    keep_one_hot=False
)
fs = FeatureSelector(data = df_feats, labels = train_labels) #-- Identify redundant features if(USE_LEARNER_FOR_FEATURE_SELECTION): # NOT COMPLETE fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99}) #-- Get valuable features X = fs.remove(methods = 'all', keep_one_hot = True) else: #-- Features with missing values greater than threshold fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD) #-- Correlated features fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD) #-- Single unique value fs.identify_single_unique() #-- TO get keys fs.ops.keys() missing_features = list(fs.ops['missing']) corelated_features = list(fs.ops['collinear']) single_value = list(fs.ops['single_unique']) r = set(flatten([missing_features,corelated_features,single_value])) #X = df_feats.drop(r, axis=1) rnk_pval = getPvalStats(df, 'target')
# NOTE(review): fragment — `dfm_` and `df_mrq` are built in earlier cells.
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
# Binary label: 1 when the monthly total return beat the S&P return.
dfm__['win'] = (dfm__['trt1m'] > dfm__['sprtrn']).astype(np.int64)
# Excess return over the S&P.
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
# NOTE(review): result discarded — notebook-style inspection only.
dfm__.isna().sum()
# Copy the computed label/return columns onto the MRQ frame (index-aligned).
df_mrq['win'] = dfm__.win
df_mrq['trt1m'] = dfm__.trt1m
df_mrq['sprtrn'] = dfm__.sprtrn
df_mrq['rtoversp'] = dfm__.rtoversp
df_mrq = df_mrq.dropna()
# Features exclude the label and its leaky derivatives.
train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)
#fs.plot_collinear(plot_all=True)
#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
#fs.identify_low_importance(cumulative_importance = 0.99)
# check_removal() lists everything flagged above (only collinear was run).
all_to_remove = fs.check_removal()
print(all_to_remove)
df_mrq_pruned = df_mrq.drop(columns=all_to_remove)
# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
# In[27]: fc = FeatureSelector(data, labels=label) # In[10]: fc.identify_missing(missing_threshold=0.95) # In[11]: fc.missing_stats.head() # In[12]: fc.identify_collinear(correlation_threshold=0.98) # In[17]: fc.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=True) # In[18]: fc.identify_low_importance(cumulative_importance=0.95) # In[19]: fc.identify_single_unique()