def clean_data(df, y=None, use_fs=True):
    # convert object columns to categorical data
    if 'thal' in df.columns:
        string_labels = ['thal']
        df[string_labels] = df[string_labels].apply(categorize_label, axis=0)
        df = pd.get_dummies(df, drop_first=True)

    # drop some columns
    to_drop = ['fasting_blood_sugar_gt_120_mg_per_dl',
               'slope_of_peak_exercise_st_segment']
    df.drop(to_drop, axis=1, inplace=True)

    # normalize high variance columns
    # high_variance_cols = ['resting_blood_pressure']
    # df[high_variance_cols] = np.log(df[high_variance_cols])

    # convert int to float
    # df = df.apply(lambda c: c.astype(float), axis=1)

    if use_fs:
        # run FeatureSelector's zero-importance analysis on the cleaned frame
        fs = FeatureSelector(data=df, labels=y)
        fs.identify_zero_importance(task='classification', eval_metric='auc',
                                    n_iterations=10, early_stopping=False)
        fs.plot_feature_importances(threshold=0.99, plot_n=14)
        # print(train_removed_all_once)

    # standard scaling
    # scaler = RobustScaler()
    # df[df.columns] = scaler.fit_transform(df[df.columns])

    # print(df.info())
    # print('\nFeature Selector analysis')
    return df
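# Hypothetical usage sketch of clean_data; 'heart.csv' and the 'target' column
# are assumed names, not from the original snippet (the column names suggest a
# heart-disease dataset).
import pandas as pd
heart_df = pd.read_csv('heart.csv')
heart_labels = heart_df.pop('target')
cleaned = clean_data(heart_df, y=heart_labels, use_fs=True)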
def select_top_features(train_data):
    # train_data is a (features, labels) pair
    fs = FeatureSelector(train_data[0], train_data[1])
    fs.identify_zero_importance(task='classification', eval_metric='auc',
                                n_iterations=6, early_stopping=True)
    fs.identify_low_importance(cumulative_importance=0.99)
    return fs.ops['zero_importance'], fs.ops['low_importance']
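# A minimal usage sketch of select_top_features; X_train and y_train are
# assumed names, not from the original snippet. The two returned lists can be
# dropped from the training frame directly; errors='ignore' covers one-hot
# columns created inside FeatureSelector that are not present in X_train.
zero_imp, low_imp = select_top_features((X_train, y_train))
X_reduced = X_train.drop(columns=list(set(zero_imp + low_imp)), errors='ignore')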
def featureselect(datas, target):
    import os
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    fs = FeatureSelector(data=datas, labels=target)
    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification', eval_metric='auc',
                                n_iterations=10, early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)
    train_removed = fs.remove(methods='all')
    return train_removed
def featuresSel(train, train_labels, name):
    """Plots the importance curve for the features.

    Arguments:
        train {pandas.DataFrame} -- Dataset
        train_labels {numpy.ndarray} -- Labels for the dataset
        name {string} -- Name for the output file
    """
    print('>>> Feature Selection...')
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification', eval_metric='auc',
                                n_iterations=10, early_stopping=True)
    plt.figure(figsize=(15, 15))
    fs.plot_feature_importances(threshold=0.99, plot_n=50, name=name)
    plt.savefig('../../data/figures/rank_{}.png'.format(name))
    plt.close()
def Bestfeature_from_cummulative_importance(inFile, outFile):
    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])

    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification', eval_metric='auc',
                                n_iterations=10, early_stopping=True)
    zero_importance_features = fs.ops['zero_importance']
    # fs.plot_feature_importances(threshold=0.99, plot_n=12)

    # index of the first feature whose cumulative importance exceeds 0.99
    importance_index = np.min(
        np.where(fs.feature_importances['cumulative_importance'] > 0.99))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)

    train_removed_all = fs.remove(methods=['zero_importance'], keep_one_hot=False)
    train_removed_all = pd.concat([train_removed_all, train_labels], axis=1)
    train_removed_all.to_csv(outFile, sep='\t', index=None)
def runFeatureSelector(self, df):
    logging.info("Running Feature Selection")
    fs = FeatureSelector(data=df, labels=self.targets)

    # Identify missing values
    fs.identify_missing(missing_threshold=0.6)

    # Identify collinearity
    fs.identify_collinear(correlation_threshold=0.98)
    fs.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Identify single unique values
    fs.identify_single_unique()
    fs.record_single_unique.to_csv(".\\utils\\csv\\record_single_unique.csv")

    # Zero importance
    fs.identify_zero_importance(task='classification',
                                eval_metric='multi_logloss',
                                n_iterations=10, early_stopping=True)
    fs.record_zero_importance.to_csv(".\\utils\\csv\\record_zero_importance.csv")

    # Low importance
    fs.identify_low_importance(cumulative_importance=0.99)
    fs.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

    # Generate a summary of all operations
    summary = pd.DataFrame.from_dict(fs.ops, orient='index')
    summary.to_csv(".\\utils\\csv\\summary.csv")

    # If the drop flag is 1, go ahead and remove the suggested features
    if self.drop == 1:
        df = fs.remove(methods='all')

    return df
def remove_unnecessary_features(self, auto=False):
    if auto:
        self.processed_data = self.processed_data.drop(
            columns=self.predefined_skip_features)
    else:
        fs = FeatureSelector(data=self.processed_data.drop("label", axis=1),
                             labels=self.processed_data["label"])
        fs.identify_missing(missing_threshold=0.6)
        fs.identify_collinear(correlation_threshold=0.98)
        fs.identify_zero_importance(task='classification', eval_metric='auc',
                                    n_iterations=10, early_stopping=False)
        fs.identify_low_importance(cumulative_importance=0.99)
        fs.identify_single_unique()

        # Remove the features flagged by all methods (returns a DataFrame)
        labels = self.processed_data["label"]
        self.processed_data = fs.remove(methods='all')
        self.processed_data["label"] = labels
def select_best_features(data_file_path, saveto_path="Default"):
    mod_data_file_path = strip_header(data_file_path)
    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    features_1hot = feature_selector.one_hot_features
    features_base = feature_selector.base_features
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(methods=['single_unique', 'collinear',
                                              'zero_importance', 'low_importance'],
                                     keep_one_hot=False)
    X_dash['Label'] = y
    X_dash.to_csv(saveto_path, index=False)

    # Prepend the row/feature counts as a one-line header
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    os.system("rm -f " + mod_data_file_path)
fs.identify_single_unique()
single_unique = fs.ops['single_unique']
single_unique
fs.plot_unique()

fs.identify_collinear(correlation_threshold=0.975)
correlated_features = fs.ops['collinear']
correlated_features[:5]
fs.plot_collinear()
fs.record_collinear.head()

fs.identify_zero_importance(task='classification', eval_metric='auc',
                            n_iterations=10, early_stopping=True)
one_hot_features = fs.one_hot_features
base_features = fs.base_features
print('There are %d original features' % len(base_features))
print('There are %d one-hot features' % len(one_hot_features))

fs.plot_feature_importances(threshold=0.99, plot_n=12)
fs.feature_importances.head(10)

if DO_DATA_EXPLORATION:
    discVar = 'target'
    contVar = 'feat_3766'
    boxplot(df, discVar, contVar)
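# The boxplot() helper called above is not defined in this snippet; a minimal
# sketch of what it might look like, assuming a seaborn-based implementation
# (discVar is the categorical/target column, contVar a continuous feature):
import seaborn as sns
import matplotlib.pyplot as plt

def boxplot(df, discVar, contVar):
    # distribution of the continuous feature within each level of the target
    sns.boxplot(x=df[discVar], y=df[contVar])
    plt.xlabel(discVar)
    plt.ylabel(contVar)
    plt.show()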
fs.identify_collinear(0.95)
print('plot_collinear()', fs.plot_collinear())

# list of collinear features to remove
collinear_features = fs.ops['collinear']
print('collinear_features', collinear_features)

# dataframe of collinear features
df_collinear_features = fs.record_collinear.sort_values('corr_value', ascending=False)
print('df_collinear_features', df_collinear_features.head(50))

# zero-importance feature statistics
# Pass in the appropriate parameters
fs.identify_zero_importance(task='classification',
                            eval_metric=tpr_weight_funtion_lc,
                            n_iterations=10, early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
print('zero_importance_features', zero_importance_features)

# low-importance feature statistics
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
print(df_low_importance.sort_values('importance', ascending=False).head(20))

# run all of the methods at once
print('go')
fs.identify_all(
    selection_params={
        'missing_threshold': 0.7,
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
import numpy as np

# columns = ['A_TS%', 'A_eFG%', 'A_3PAr', 'A_FTr', 'A_ORB%', 'A_DRB%',
#            'A_TRB%', 'A_AST%', 'A_STL%', 'A_BLK%', 'A_TOV%', 'A_ORtg', 'A_DRtg',
#            'H_TS%', 'H_eFG%', 'H_3PAr', 'H_FTr', 'H_ORB%', 'H_DRB%',
#            'H_TRB%', 'H_AST%', 'H_STL%', 'H_BLK%', 'H_TOV%', 'H_ORtg', 'H_DRtg']
columns = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
           'AST%', 'STL%', 'BLK%', 'TOV%', 'ORtg', 'DRtg']

features = pd.DataFrame(adv_diff_features(None))
labels = pd.DataFrame(adv_diff_labels())
features.columns = columns
labels.columns = ['POINT_DIFF']
print(len(features), len(labels))

fs = FeatureSelector(data=features, labels=labels)
fs.identify_missing(missing_threshold=0.9)
fs.identify_collinear(correlation_threshold=0.5)
fs.plot_collinear()

fs2 = FeatureSelector(data=features, labels=labels[:, ])
fs2.identify_zero_importance(eval_metric='l2', task='regression')
# fs2.identify_low_importance()
print(fs.record_collinear.head())
print("# identify_zero_importance") print("使用LightGBM库训练GB集成算法,评价特征之间的重要性") print("1. 为了降低随机性,模型默认会训练10次") print( "2. 模型默认会采用 early stopping 的操作形式,使用15%的数据作为 validation data 去获取 optimal number of estimators" ) print("3. 需要使用到的参数") print(" task: classification or regression , metrics 与这个是相关的") print( " eval_metric: 用于 early stopping 的指标,auc for classification, L2 for regression" ) print(" n_iterations:训练的次数,默认是10次,feature importances会取10次计算结果的平均值") print(" early_stopping: 默认在训练的时候是使用 early stopping 模式的,early stopping") print(" 可以理解成一个 regulation,为了防止训练数据的过拟合") fs.identify_zero_importance(task="classification", eval_metric="auc", n_iterations=10, early_stopping=True) zero_importance_features = fs.ops["zero_importance"] with open("zero_importance.txt", "w") as f: for index, zero_importance_feature in enumerate(zero_importance_features): f.write("特征个数:{} 特征名称:{}\n".format(index, zero_importance_feature)) fs.plot_feature_importances(threshold=0.99, plot_n=20) plt.savefig("feature_importance.jpg", dpi=300) plt.show() one_hundred_features = list(fs.feature_importances.loc[:99, "feature"]) print("\n") print("# identify_low_importance")
constant_train = x.loc[:, (x == 0).all()].columns.tolist()
print('Number of constant columns in the train set:', len(constant_train))

# %%
fs = FeatureSelector(x, y)
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()

# %%
fs.identify_collinear(correlation_threshold=0.9)
fs.record_collinear.head()

# %%
fs.identify_zero_importance(task='regression', eval_metric='rmse',
                            n_iterations=10, early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']

# %%
fs.identify_low_importance(cumulative_importance=0.99999)
fs.record_low_importance.head()
fs.plot_feature_importances(50)

# %%
train_removed = fs.remove(methods='all')
X_clean = train_removed

# %%
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
    return np.transpose(tt)

miss_data(df)

# %%
fs = FeatureSelector(x, y)
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()

# %%
fs.identify_collinear(correlation_threshold=0.9)
fs.record_collinear.head()

# %%
fs.identify_zero_importance(task='classification', n_iterations=10,
                            early_stopping=False)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']

# %%
fs.identify_low_importance(cumulative_importance=0.99)
fs.record_low_importance.head()
fs.plot_feature_importances(100)

# %%
train_removed = fs.remove(methods='all')
X_clean = train_removed

import seaborn as sns
import matplotlib.pyplot as plt
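# The snippet ends right after importing seaborn and matplotlib; a possible
# follow-up sketch (an assumption, not part of the original) is to inspect the
# correlations that remain in X_clean after feature selection:
plt.figure(figsize=(10, 8))
sns.heatmap(X_clean.corr(), cmap='coolwarm', center=0)
plt.title('Feature correlations after selection')
plt.show()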