def feature_engineering(self, x_data, y_data, train=None):
    """Select features at train time, or re-apply a stored selection.

    Column layout of ``x_data`` (per the original notes): 0-10 consumption,
    10-22 recruitment, 22-32 acceleration, 32-48 building, 48-97 science,
    97-103 pvp, 103-106 payment.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Feature matrix. Only used on the training path; on the
        non-training path the stored ``self.feature_df`` is reduced
        instead (see NOTE below).
    y_data : array-like
        Labels aligned with ``x_data`` (training path only).
    train : bool, optional
        Truthy: fit a ``FeatureSelector``, persist the per-method map of
        dropped columns to ``drop_columns.pkl`` and store the reduced
        frame on ``self.feature_df``. Falsy (default): drop the columns
        recorded in ``self.drop_columns`` from ``self.feature_df``
        in place.

    Side effects
    ------------
    Sets ``self.drop_columns`` / ``self.feature_df`` (train path),
    writes ``drop_columns.pkl`` (train path), mutates
    ``self.feature_df`` (inference path), and prints the drop map.
    """
    if train:
        # Fit the selector and record, per identification method, which
        # columns it wants removed (fs.ops is a {method: [columns]} map).
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(
            selection_params={
                'missing_threshold': 0.6,
                'correlation_threshold': 0.98,
                'task': 'classification',
                'eval_metric': 'auc',
                'cumulative_importance': 0.99,
            })
        self.drop_columns = fs.ops
        # Persist the selection so later (inference) runs can reproduce it.
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # Flatten the per-method column lists into a single drop list
        # (replaces the original nested append loop).
        drop_list = [col for cols in self.drop_columns.values() for col in cols]
        # NOTE(review): this drops from self.feature_df (the frame stored
        # during training) rather than from x_data, which is otherwise
        # unused on this path — confirm that is the intended behavior.
        self.feature_df.drop(drop_list, axis=1, inplace=True)
    print(self.drop_columns)
#-- Separate features from labels y = df['target'] train_labels = y df_feats = df.drop(columns = ['target']) #-- Create an instance fs = FeatureSelector(data = df_feats, labels = train_labels) #-- Identify redundant features if(USE_LEARNER_FOR_FEATURE_SELECTION): # NOT COMPLETE fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99}) #-- Get valuable features X = fs.remove(methods = 'all', keep_one_hot = True) else: #-- Features with missing values greater than threshold fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD) #-- Correlated features fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD) #-- Single unique value fs.identify_single_unique() #-- TO get keys fs.ops.keys() missing_features = list(fs.ops['missing']) corelated_features = list(fs.ops['collinear'])
early_stopping=True)  # tail of an fs.identify_* call that begins before this chunk

# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
print('zero_importance_features', zero_importance_features)

# Low-importance feature statistics: flag features beyond 99% cumulative
# importance, then show the top-20 most important ones.
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
print(df_low_importance.sort_values('importance', ascending=False).head(20))

# Run all identification methods in one pass.
# NOTE(review): "tpr_weight_funtion_lc" (sic) is a custom eval metric defined
# elsewhere — the misspelling must match its definition site.
print('go')
fs.identify_all(
    selection_params={
        'missing_threshold': 0.7,
        'correlation_threshold': 0.99,
        'task': 'classification',
        'eval_metric': tpr_weight_funtion_lc,
        'cumulative_importance': 0.999
    })

# Remove the flagged features.
# Remove the features from all methods (returns a df)
# NOTE(review): this unpacks two values from fs.remove — presumably a locally
# modified feature_selector; the upstream library returns a single DataFrame.
left_feature, removed_feature = fs.remove(methods=[
    'missing', 'single_unique', 'collinear', 'zero_importance',
    'low_importance'
],
    keep_one_hot=True)
print('left_feature\n ', left_feature.columns, left_feature.shape)
# NOTE(review): 'emoved_feature' below is a typo for 'removed_feature' in a
# runtime string; left untouched here. Statement continues past this chunk.
print(
    'emoved_feature\n', len(removed_feature),
'residentAddr'] = df[df['isNew'] == 0]['residentAddr'].apply(
    lambda x: x if x == -999 else x - 300000)  # tail of a df assignment starting
    # before this chunk; -999 looks like a missing-value sentinel and 300000 an
    # encoding offset — TODO confirm against the data dictionary

# Feature selection; parameter meanings are explained (in Chinese) in the
# string below: missing_threshold drops features with >60% missing values,
# correlation_threshold bounds pairwise correlation, task/eval_metric set the
# learner objective, cumulative_importance keeps features covering 95% of
# total importance.
"""
missing_threshold表示数据特征缺失值比例阈值,当缺失值比例超过0.6时则删除该特征
correlation_threshold表示特征之间的相关性
task指的是进行的任何,eval_metric表示使用的评价指标
cumulative_importance指的是按特征重要性排序后的特征累加,看多少个特征重要性累加可以达到0.95
"""
fs = FeatureSelector(data=x, labels=y)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.9,
        'task': 'regression',
        'eval_metric': 'mse',
        'cumulative_importance': 0.95
    })
# NOTE(review): only three of the identification methods are applied here;
# 'collinear' and 'low_importance' findings are computed but not removed.
choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)

# Use the selected feature set to build the training and prediction frames.
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
choose.columns  # no-op inspection expression; result discarded

# Because of class imbalance, 50% of the positive (label 1) samples are set
# aside for the test split (per the original note).
label_1 = train_data_1['target']
label_0 = train_data_0['target']