def select_features_without_label(features: pd.DataFrame, missing_threshold=0.99, correlation_threshold=1.0) -> pd.DataFrame:
    """Drop uninformative columns from an unlabeled feature matrix.

    Removes columns that are mostly missing (above ``missing_threshold``)
    or contain a single unique value. When ``correlation_threshold`` is
    below 1.0, highly collinear columns are removed as well.

    Args:
        features: Input feature matrix.
        missing_threshold: Fraction of missing values above which a
            column is dropped.
        correlation_threshold: Pairwise-correlation cutoff; 1.0 (the
            default) disables the collinearity check entirely.

    Returns:
        A new DataFrame with the flagged columns removed.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()
    # Only pay for the (expensive) collinearity scan when it was requested.
    removal_methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        removal_methods.append("collinear")
    return selector.remove(methods=removal_methods)
def prepare_data():
    """Read, clean, and split the dataset for training and testing.

    Main entry point of this module: loads the training/testing CSVs,
    prunes collinear features, shuffles both splits, and delegates the
    final encoding to ``clean_nominals_and_create_our_datasets``.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y, labels)``.

    Note:
        A previous revision also read ``UNSW_NB15_features.csv`` into a
        variable that was never used; that dead file read was removed.
    """
    # Read our csv files and discard the synthetic "id" column.
    training_df = pd.read_csv("training.csv").drop("id", axis=1)
    testing_df = pd.read_csv("testing.csv").drop("id", axis=1)

    # Drop highly collinear features (|corr| > 0.85) from the training set.
    fs = FeatureSelector(data=training_df)
    fs.identify_collinear(correlation_threshold=0.85)
    training_df = fs.remove(methods=['collinear'], keep_one_hot=True)

    # Keep the test set aligned with the surviving training columns.
    columnlist = list(training_df)
    testing_df = testing_df[columnlist]

    # Shuffle both splits (frac=1 returns all rows in random order).
    training_df = training_df.sample(frac=1)
    testing_df = testing_df.sample(frac=1)

    train_x, train_y, test_x, test_y, labels = clean_nominals_and_create_our_datasets(training_df, testing_df)

    # Drop the targets before printing the feature list actually used.
    training_df = training_df.drop(["attack_cat", "label"], axis=1)
    print("The features we will use are: ", np.array(list(training_df)))
    return train_x, train_y, test_x, test_y, labels
def transform_to_nominal():
    """Recompute the feature-selection pipeline on the training CSV.

    Mirrors the cleaning done in ``prepare_data`` (collinearity pruning at
    0.85, shuffle, target-column removal) and extracts the class labels and
    nominal columns via ``retrieve_classes``.

    Returns:
        Tuple ``(labels, nominal_cols, columnList)`` where ``columnList``
        is the ordered list of surviving feature columns.
    """
    # Load the training data and drop the synthetic id column.
    frame = pd.read_csv("training.csv").drop("id", axis=1)

    # Prune highly correlated features, same threshold as prepare_data().
    selector = FeatureSelector(data=frame)
    selector.identify_collinear(correlation_threshold=0.85)
    frame = selector.remove(methods=['collinear'], keep_one_hot=True)

    # Shuffle, then strip the target columns.
    frame = frame.sample(frac=1)
    frame = frame.drop(["attack_cat", "label"], axis=1)

    columnList = list(frame)
    labels, nominal_cols = retrieve_classes(frame)
    return labels, nominal_cols, columnList
def featureselect(datas, target):
    """Run the full feature_selector pipeline on (datas, target).

    Applies the missing-value, collinearity, zero-importance and
    low-importance checks, then removes every flagged feature.

    Returns:
        DataFrame with all flagged features removed.
    """
    import os
    # NOTE(review): this permanently changes the process working directory so
    # that the local feature_selector module can be imported on the next line.
    # The side effect leaks to the caller — presumably sys.path manipulation
    # was intended; confirm before changing.
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector
    fs = FeatureSelector(data=datas, labels=target)
    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)
    # Remove everything flagged by any of the four checks above.
    train_removed = fs.remove(methods='all')
    return train_removed
def Bestfeature_from_cummulative_importance(inFile, outFile):
    """Drop zero-importance features from a TSV dataset and save the result.

    Reads a tab-separated file with a ``class_label`` column, fits the
    FeatureSelector's importance model, prints the index at which cumulative
    importance first exceeds 0.99, removes the zero-importance features, and
    writes the reduced table (labels re-attached) to ``outFile``.
    """
    frame = pd.read_csv(inFile, sep='\t')
    print(frame.shape)

    target = frame['class_label']
    predictors = frame.drop(columns=['class_label'])

    selector = FeatureSelector(data=predictors, labels=target)
    selector.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=True)
    zero_importance_features = selector.ops['zero_importance']
    # selector.plot_feature_importances(threshold=0.99, plot_n=12)

    # First feature index where cumulative importance passes 0.99.
    importance_index = np.min(np.where(selector.feature_importances['cumulative_importance'] > 0.99))
    selector.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)

    # Only the zero-importance features are actually removed here.
    reduced = selector.remove(methods=['zero_importance'], keep_one_hot=False)
    reduced = pd.concat([reduced, target], axis=1)
    reduced.to_csv(outFile, sep='\t', index=None)
def feature_engineering(self, x_data, y_data, train=None):
    """Feature selection step.

    In training mode (``train`` truthy) fits a FeatureSelector over every
    detection method, persists the columns-to-drop mapping to
    ``drop_columns.pkl``, and stores the reduced frame on
    ``self.feature_df``. Otherwise drops the previously recorded columns
    from ``self.feature_df`` in place.
    """
    cols = x_data.columns
    # Named column groups of the raw feature layout (kept for reference).
    consume_col = cols[0:10]        # consumption
    recruit_col = cols[10:22]       # recruiting
    acceleration_col = cols[22:32]  # acceleration
    build_col = cols[32:48]         # building
    science_col = cols[48:97]       # technology
    pvp_col = cols[97:103]          # pvp
    pay_col = cols[103:106]         # payment
    # label
    # label_col = cols[108]
    if train:
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(selection_params={
            'missing_threshold': 0.6,
            'correlation_threshold': 0.98,
            'task': 'classification',
            'eval_metric': 'auc',
            'cumulative_importance': 0.99,
        })
        self.drop_columns = fs.ops
        # Persist the drop map so inference runs can reuse it.
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # Flatten the recorded {method: [columns]} map and drop in place.
        drop_list = [name for names in self.drop_columns.values() for name in names]
        self.feature_df.drop(drop_list, axis=1, inplace=True)
    print(self.drop_columns)
def runFeatureSelector(self, df):
    """Run every FeatureSelector check on ``df`` and dump each report to CSV.

    Writes the collinearity, single-unique, zero-importance, feature
    importance, and summary reports under ``.\\utils\\csv\\``. When
    ``self.drop == 1`` the flagged features are removed from the returned
    frame; otherwise ``df`` is returned unchanged.
    """
    logging.info("Running Feature Selection")
    selector = FeatureSelector(data=df, labels=self.targets)

    # Missing values above the 60% threshold.
    selector.identify_missing(missing_threshold=0.6)

    # Collinear features at |corr| > 0.98, with a CSV audit trail.
    selector.identify_collinear(correlation_threshold=0.98)
    selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Columns with a single unique value.
    selector.identify_single_unique()
    selector.record_single_unique.to_csv(
        ".\\utils\\csv\\record_single_unique.csv")

    # Features with zero importance under a LightGBM model.
    selector.identify_zero_importance(task='classification', eval_metric='multi_logloss', n_iterations=10, early_stopping=True)
    selector.record_zero_importance.to_csv(
        ".\\utils\\csv\\record_zero_importance.csv")

    # Features beyond the 0.99 cumulative-importance cutoff.
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

    # Summary of every operation performed.
    summary = pd.DataFrame.from_dict(selector.ops, orient='index')
    summary.to_csv(".\\utils\\csv\\summary.csv")

    # Only remove the suggested features when the drop flag is set.
    if self.drop == 1:
        df = selector.remove(methods='all')
    return df
def remove_unnecessary_features(self, auto=False):
    """Prune low-value columns from ``self.processed_data``.

    With ``auto=True`` simply drops the predefined skip-feature list.
    Otherwise runs the full FeatureSelector pipeline against the "label"
    column and keeps only the surviving features, re-attaching the label.
    """
    if auto:
        # Fast path: drop the curated list and skip the selector entirely.
        self.processed_data = self.processed_data.drop(
            columns=self.predefined_skip_features)
        return

    selector = FeatureSelector(data=self.processed_data.drop("label", axis=1),
                               labels=self.processed_data["label"])
    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.98)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='auc',
                                      n_iterations=10,
                                      early_stopping=False)
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.identify_single_unique()

    # Remove everything flagged above, then restore the label column.
    label_col = self.processed_data["label"]
    self.processed_data = selector.remove(methods='all')
    self.processed_data["label"] = label_col
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce a labeled CSV to its most informative features.

    Strips the header via ``strip_header``, runs the FeatureSelector
    single-unique / collinear / zero-importance / low-importance checks
    against the "Label" column, writes the reduced table to
    ``saveto_path`` with a leading "<rows>,<features>" metadata line, and
    deletes the intermediate stripped file.

    Args:
        data_file_path: Path to the input CSV (must contain a "Label" column).
        saveto_path: Output path; "Default" derives "<input>_reduced.csv".

    Note:
        The previous ``os.system("rm -f ...")`` call was replaced with
        ``os.remove`` — portable and not vulnerable to shell injection via
        the file path. Unused ``one_hot_features``/``base_features`` reads
        were also removed.
    """
    mod_data_file_path = strip_header(data_file_path)
    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    feature_selector.identify_low_importance(cumulative_importance=0.99)
    X_dash = feature_selector.remove(methods=[
        'single_unique', 'collinear', 'zero_importance', 'low_importance'
    ], keep_one_hot=False)

    # Re-attach the labels and write the reduced dataset.
    X_dash['Label'] = y
    X_dash.to_csv(saveto_path, index=False)

    # Prepend a "<n_rows>,<n_features>" metadata line to the output file.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    with open(saveto_path, 'w') as fh:
        fh.write(','.join(meta_data) + '\n' + contents)

    # Clean up the intermediate file; mimic `rm -f` by ignoring absence.
    try:
        os.remove(mod_data_file_path)
    except FileNotFoundError:
        pass
# --- Feature selection on the training split --------------------------------
# NOTE(review): `fs`, X_train, X_test, y_train, y_test, test and test1 are
# presumably defined earlier in the script (not visible in this chunk).
fs.identify_single_unique()
single_unique = fs.ops['single_unique']
fs.identify_collinear(correlation_threshold=0.95)
correlated_features = fs.ops['collinear']
fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=True)
zero_importance_features = fs.ops['zero_importance']
fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops['low_importance']
# Drop the flagged features from every split so columns stay aligned.
X_train = fs.remove(methods='all', keep_one_hot=False)
X_test = X_test.drop(columns=fs.removed_features)
test = test.drop(columns=fs.removed_features)
# --- Random forest fit, evaluation and submission file ----------------------
clf1 = RandomForestClassifier(n_estimators=8000, max_depth=8)
clf1.fit(X_train, np.ravel(y_train))
pred = clf1.predict(X_test)
score = roc_auc_score(y_test, pred)
#print(est, md, score)
final = clf1.predict(test)
final = pd.Series(final)
answer = pd.concat([test1['index'], final], axis=1)
answer.columns = ['index', 'TARGET']
answer.to_csv("submission.csv", index=False)
# Feature scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns) X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns) # Feature selection (remove highly correlated features) from feature_selector import FeatureSelector n = len(X_train.T) fs = FeatureSelector(data=X_train) fs.identify_collinear( correlation_threshold=0.7) # select features from training set corr = fs.ops['collinear'] X_train = fs.remove(methods=['collinear' ]) # remove selected features from training set to_remove = pd.unique( fs.record_collinear['drop_feature']) # features to remove X_test = X_test.drop( columns=to_remove) # remove selected features from test set # Create the artificial neural network import keras from keras.models import Sequential from keras.layers import Dense from keras.layers import Dropout num_input_nodes = len(X_train.T) num_output_nodes = 1 num_hidden_nodes = int( (num_input_nodes + num_output_nodes) / 2) # a typical value
Now, we will apply collinear, zero importance, low importance and sigle unique feature importance to selecti which columns to remove """
# Run the four detection passes, then drop everything they flagged.
fs.identify_collinear(correlation_threshold=0.975)
fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
fs.identify_low_importance(cumulative_importance = 0.99)
fs.identify_single_unique()
to_remove = fs.check_removal()  # every column queued for removal
feature_df = fs.remove(
    methods = ['collinear', 'zero_importance', 'low_importance', 'single_unique'],
    keep_one_hot=False
)
feature_df.head()
# Cast the nominal columns to pandas 'category' dtype and record their
# category sets (dict literal continues beyond this chunk).
category_columns = [
    'protocol_type', 'service', 'flag'
]
feature_df[category_columns] = feature_df[category_columns].astype('category')
categories = {
    'protocol_type' : feature_df.protocol_type.cat.categories,
    'service' : feature_df.service.cat.categories,
# list of zero importance features zero_importance_features = fs.ops['zero_importance'] # %% #尋找低貢獻的Feature #當重要貢獻度的feautures累積超過0.99後,剩下就是低貢獻features fs.identify_low_importance(cumulative_importance=0.9) fs.record_low_importance # %% #排序找出貢獻高的因子 fs.feature_importances.sort_values(by='cumulative_importance') # %% #method可以客製化你想要先去除的 train_removed = fs.remove(methods='all') # %% all_to_remove = fs.check_removal() all_to_remove # %% def featureselect(datas, target): import os os.chdir('c:\\Users\\SA\\python\\練習py') from feature_selector import FeatureSelector fs = FeatureSelector(data=datas, labels=target) fs.identify_missing(missing_threshold=0.6) fs.identify_collinear(correlation_threshold=0.9)
train_data.append(df) # Define name of 12 features set file_name = ["AtomPairs2D","AtomPairs2DCount","EState", "Extended", "Fingerprinterd", "GraphOnly", "KlekotaRoth", "KlekotaRothCount", "MACCS", "Pubchem", "Substructure", "SubstructureCount"] file_name = sorted(file_name) # Sorting name ################# #Load one train data for get labels train_label = pd.read_csv("Data/DILI_data/DILI_train_MF/DILI_train_AtomPairs2D.csv") # Start feature selecting and add labels for each training dataset for train, name in zip(train_data, file_name): feature_columns = [] labels = train_label["class."] X_train = train.drop(labels = "Name", axis = 1) fs = FeatureSelector(data = X_train, labels = labels) fs.identify_all(selection_params = {'missing_threshold': 0.8, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99,'num_threads':-1}) train_removed_all = fs.remove(methods = 'all', keep_one_hot=False) print('Original Number of Features', train.shape[1]) print('Final Number of Features: ', train_removed_all.shape[1]) train_removed_all.head() feature_columns.extend(train_removed_all.columns) feature_columns = pd.DataFrame(feature_columns,index=None) feature_columns.to_csv('Features_'+ name+'.csv',index = False, header = name) train_removed_all['class.']=labels train_removed_all.to_csv('Data/Feature_Data/Feature_Data/Feature_Train_'+ name + '.csv', index=False, header=True)
#feature_selector from feature_selector import FeatureSelector fs = FeatureSelector(data=train_data, labels=train_label) #find features with 0 variance fs.identify_single_unique() #recursive feature elimination fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=5, early_stopping=True) print("finish zero importance analysis") fs.identify_low_importance(cumulative_importance=0.99) print("finish low importance analysis") train_data = fs.remove(methods='all') print("finish removing train_data") for col in test_data.columns: if col in train_data.columns: continue else: test_data = test_data.drop([col], axis=1) print("done with feature selection!") print(f"training data: {train_data.shape}") print(f"testing data: {test_data.shape}") #lightgbm lgb_train = lgb.Dataset(train_data, train_label) lgb_eval = lgb.Dataset(test_data, test_label, reference=lgb_train) params = { 'boosting_type': 'gbdt',
# need to pass in a cumulative_importance that accounts for that fraction of total feature importance. fs.identify_low_importance(cumulative_importance=0.99) low_importance_features = fs.ops['low_importance'] print(low_importance_features[:5]) fs.plot_feature_importances(threshold=0.99, plot_n=12) plt.show() # 6 Removing Features # Removing Features: This method returns the resulting data which we can then use for machine learning. # The original data will still be accessible in the data attribute of the Feature Selector. train_no_missing = fs.remove(methods=['missing']) #以鉴别17种 train_no_missing_zero = fs.remove(methods=['missing', 'zero_importance']) #已经鉴别66+17=83种 all_to_remove = fs.check_removal() #检查所有要删除的features print(all_to_remove[0:]) train_removed = fs.remove(methods='all') #删除所有的不好的features # 7 Handling One-Hot Features train_removed_all = fs.remove(methods='all', keep_one_hot=False) print('Original Number of Features', train.shape[1]) print('Final Number of Features: ', train_removed_all.shape[1])
correlation_threshold表示特征之间的相关性 task指的是进行的任何,eval_metric表示使用的评价指标 cumulative_importance指的是按特征重要性排序后的特征累加,看多少个特征重要性累加可以达到0.95 """
fs = FeatureSelector(data=x, labels=y)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.9,
        'task': 'regression',
        'eval_metric': 'mse',
        'cumulative_importance': 0.95
    })
choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)
# Use the selected feature set to build the training and prediction matrices.
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
choose.columns
# The classes are imbalanced, so half of the positive (target == 1) samples
# are reserved for the test split (handled around this chunk).
label_1 = train_data_1['target']
label_0 = train_data_0['target']
train_data_1 = train_data_1[choose.columns.values]
train_data_0 = train_data_0[choose.columns.values]
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(train_data_0,
                                                            label_0,
                                                            test_size=.2,
                                                            random_state=333)
#低重要度特征统计 fs.identify_low_importance(cumulative_importance=0.99) df_low_importance = fs.feature_importances print(df_low_importance.sort_values('importance', ascending=False).head(20)) #一次行运行所有函数 print('go') fs.identify_all( selection_params={ 'missing_threshold': 0.7, 'correlation_threshold': 0.99, 'task': 'classification', 'eval_metric': tpr_weight_funtion_lc, 'cumulative_importance': 0.999 }) #移除特征 # Remove the features from all methods (returns a df) left_feature, removed_feature = fs.remove(methods=[ 'missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance' ], keep_one_hot=True) print('left_feature\n ', left_feature.columns, left_feature.shape) print( 'emoved_feature\n', len(removed_feature), '\n', removed_feature, )
print("# identify_low_importance")
fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops["low_importance"]
# Write every low-importance feature (index + name) to a report file.
with open("low_importance.txt", "w") as f:
    for index, low_importance_feature in enumerate(low_importance_features):
        f.write("特征个数:{} 特征名称:{}\n".format(index + 1, low_importance_feature))
print("#-----------------------------------------#")
print("\n")
print("#-----------------------------------------#")
print("移除上述方法判断出来的不需要特征")
print("输出需要被移除的特征")
# List, then remove, every feature flagged by the earlier identify_* passes.
feature_remove = fs.check_removal()
for i in feature_remove:
    print("移除特征:{}".format(i))
data_remove_feature = fs.remove(methods="all")
print("原始特征个数:{}".format(data.shape[1]))
print("当前特征个数:{}".format(data_remove_feature.shape[1]))
print("#-----------------------------------------#")
print("\n")
print("#---------------------------------#")
print("剩下特征缺失值使用0来进行填充")
# Fill remaining missing values with 0, then verify nothing is still missing.
data = data_remove_feature.replace(np.NaN, 0)
if data.isnull().any().any():
    print("数据集中存在数据缺失")
    print(data.shape[0] - data.count())
else:
    print("数据集中不存在参数缺失")
print("#---------------------------------#")
print("\n")
#-- Separate features from labels y = df['target'] train_labels = y df_feats = df.drop(columns = ['target']) #-- Create an instance fs = FeatureSelector(data = df_feats, labels = train_labels) #-- Identify redundant features if(USE_LEARNER_FOR_FEATURE_SELECTION): # NOT COMPLETE fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99}) #-- Get valuable features X = fs.remove(methods = 'all', keep_one_hot = True) else: #-- Features with missing values greater than threshold fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD) #-- Correlated features fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD) #-- Single unique value fs.identify_single_unique() #-- TO get keys fs.ops.keys() missing_features = list(fs.ops['missing']) corelated_features = list(fs.ops['collinear']) single_value = list(fs.ops['single_unique']) r = set(flatten([missing_features,corelated_features,single_value]))
# Fit the column mapper on train only; apply the same transform to test.
X_train_df = mapper.fit_transform(X_train.copy())
X_test_df = mapper.transform(X_test.copy())
#feature selection.
fs = FeatureSelector(data=X_train_df, labels=y_train)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.98,
        'task': 'classification',
        'eval_metric': 'auc',
        'cumulative_importance': 0.99
    })
fs.feature_importances.head()
train_removed_all_once = fs.remove(methods='all', keep_one_hot=True)
# Keep the test matrix aligned with the selected training columns.
test_removed_all_once = X_test_df[train_removed_all_once.columns]
#model train pipeline.
# NOTE(review): the xgb_param dict is truncated in this chunk.
xgb_param = {
    'eta': 0.5,
    'silent': 0,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'gamma': 0.0001,
    'min_child_weight': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'auc',
    'scale_pos_weight': 1,
    'eval_train': 1