def process_train_woe(cfg=None, feature_name=None, target=None):
    """Compute and persist the WOE binning rule for a single feature file.

    Reads ``<feature_name>.csv`` from the raw-data split directory, merges it
    with *target* on the row index, runs continuous or discrete WOE binning
    depending on whether the feature appears in ``cfg.bin_var_list``, and
    pickles the ``(InfoValue, feature_detail)`` pair next to the WOE rules.

    Parameters
    ----------
    cfg : config object exposing bin_var_list, variable_type, global_bt,
        global_gt and min_sample.
    feature_name : str, base name (no extension) of the feature csv/pkl.
    target : DataFrame carrying the label column, index-aligned with the
        feature file.

    Returns
    -------
    tuple ``(riv, feature_detail)`` — the InfoValue rule and its detail table.
    """
    print('run into process_train_woe: \n', feature_name,
          time.asctime(time.localtime(time.time())))
    feature_path = ('E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\'
                    'raw_data\\dataset_split_by_cols\\')
    feature_path = feature_path + feature_name + '.csv'
    feature = pd.read_csv(feature_path)
    rst = []
    # The file holds exactly one feature column; name it once up front.
    var = feature.columns[0]
    if var in list(cfg.bin_var_list):
        # BUG FIX: the original wrote `feature.loc[mask] = -1`, which assigns
        # -1 to EVERY column of the matching rows; restrict the null-fill to
        # the feature column itself.
        feature.loc[feature[var].isnull(), var] = -1
        fp.change_feature_dtype(feature, cfg.variable_type)
        dataset = pd.merge(feature.reset_index(),
                           target.reset_index()).drop('index', axis=1)
        # Free the large intermediates before the (expensive) binning call.
        del feature
        del target
        riv = fp.proc_woe_continuous(dataset, var, cfg.global_bt,
                                     cfg.global_gt, cfg.min_sample,
                                     alpha=0.05)
    else:
        # process woe transformation of discrete variables
        print('process woe transformation of discrete variables: \n',
              time.asctime(time.localtime(time.time())))
        # Same row-clobber fix as above, with the discrete sentinel value.
        feature.loc[feature[var].isnull(), var] = 'missing'
        fp.change_feature_dtype(feature, cfg.variable_type)
        dataset = pd.merge(feature.reset_index(),
                           target.reset_index()).drop('index', axis=1)
        del feature
        del target
        riv = fp.proc_woe_discrete(dataset, var, cfg.global_bt,
                                   cfg.global_gt, cfg.min_sample,
                                   alpha=0.05)
    rst.append(riv)
    feature_detail = eval.eval_feature_detail(rst)
    rst_path = 'E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\gendata\\WOE_Rule\\'
    rst_path = rst_path + feature_name + '.pkl'
    result = (riv, feature_detail)
    # Context manager guarantees the handle is closed even if dump fails.
    with open(rst_path, 'wb') as output:
        pickle.dump(result, output)
    return result
def get_cut_result(self):
    """Bin every feature of self.X with WOE and optionally transform/tag it.

    Builds one InfoValue rule per column (discrete for columns listed in
    self.classVars, continuous otherwise), storing them in self.civ_dict.
    If self.file_name is set, a feature-detail report is written there.
    If self.is_changed is set, a WOE-transformed copy of the data is built;
    if self.is_tag is additionally set, each transformed value is re-labelled
    with its bin interval string.

    Returns (woe_fp, changed_df, feature_detail):
      woe_fp         -- dict column -> the `fp` module used for transformation
      changed_df     -- WOE-transformed frame, or None when is_changed is False
      feature_detail -- report frame, or None when file_name is None
    """
    df = self.X.copy()
    df['target'] = self.y
    self.civ_dict = {}
    woe_fp = {}
    for column in list(df.columns):
        if column == 'target':
            continue
        self.log.info("------ Now dealing the var={}".format(column))
        # Exclude rows carrying special sentinel values from the binning.
        tmp_df = df[~df[column].isin(self.special_value[column])]
        if column in self.classVars:
            # min_sample is 5% of the remaining (non-special) rows.
            civ = fp.proc_woe_discrete(tmp_df, column, self.global_bt,
                                       self.global_gt, 0.05 * len(tmp_df),
                                       alpha=0.05)
        else:
            civ = fp.proc_woe_continuous(tmp_df, column, self.global_bt,
                                         self.global_gt, 0.05 * len(tmp_df),
                                         alpha=0.05)
        self.civ_dict[column] = civ
        woe_fp[column] = fp
    if self.file_name is not None:
        feature_detail = eval.eval_feature_detail(
            [v for k, v in self.civ_dict.items()], self.file_name)
    else:
        feature_detail = None
    if self.is_changed:
        changed_df = df.copy()
        changed_df = changed_df.drop(['target'], axis=1)
        for column in list(df.columns):
            if column == 'target':
                continue
            changed_df[column] = woe_fp[column].woe_trans(
                df[column], self.civ_dict[column])
        if self.is_tag:
            # Replace each WOE value with its bin's interval label, e.g.
            # "(a,b]"; the last bin is open-ended: "(last_split,inf]".
            for column in list(df.columns):
                # NOTE(review): unlike the loops above, this one has no
                # `column == 'target'` guard; 'target' is absent from
                # self.civ_dict, which looks like it would raise KeyError
                # here — confirm with the caller.
                woe_list = [str(i) for i in self.civ_dict[column].woe_list]
                split_list = self.civ_dict[column].split_list
                if len(split_list) == 0:
                    continue
                # NOTE(review): x comes from woe_trans output while woe_list
                # holds stringified values — presumably woe_trans yields
                # values that compare equal to these strings; verify.
                changed_df[column] = changed_df[column].map(
                    lambda x: '(' + str(split_list[-1]) + ',' + 'inf]'
                    if x == woe_list[-1]
                    else '(' + str(split_list[woe_list.index(x) - 1]) + ',' +
                    str(split_list[woe_list.index(x)]) + ']')
    else:
        # No transformation requested: signal it with None.
        changed_df = None
    return woe_fp, changed_df, feature_detail
def get_iv(df, cols, target, outputfile='./data/feature_detail_iv_list.csv'):
    """Compute WOE/IV details for the given columns of *df*.

    proc_woe_discrete / proc_woe_continuous share the same signature:
        proc_woe_*(df, var, global_bt, global_gt, min_sample, alpha=0.01)
    where
        df         -- DataFrame that must contain a 'target' column in {0, 1}
        var        -- name of the variable to bin
        global_bt  -- global bad total (number of positive samples)
        global_gt  -- global good total (number of negative samples)
        min_sample -- minimum samples per bin, usually 5% of the total
        alpha      -- split criterion: split when iv_split > iv_nosplit*(1+alpha)

    Parameters
    ----------
    df : input DataFrame; the *target* column must take values in {0, 1}.
    cols : iterable of column names to evaluate.
    target : name of the label column (renamed to 'target' on the copy).
    outputfile : CSV path for the feature-detail report.

    Returns
    -------
    DataFrame of per-bin WOE/IV details for every column in *cols*.
    """
    import woe.feature_process as fp
    import woe.eval as woe_eval  # renamed: avoid shadowing the builtin `eval`
    data = df.copy()
    # data_woe deliberately aliases data: WOE values are written back into
    # the same frame.
    data_woe = data
    data_woe.rename(columns={target: 'target'}, inplace=True)
    civ_list = []
    n_positive = sum(data['target'])
    n_negtive = len(data) - n_positive
    for column in list(cols):
        # BUG FIX: the original condition was
        #     data[column].dtypes == 'object' or 'category'
        # which is always truthy ('category' is a non-empty string), so every
        # column — numeric ones included — was binned as discrete.
        if str(data[column].dtypes) in ('object', 'category'):
            civ = fp.proc_woe_discrete(data, column, n_positive, n_negtive,
                                       0.05 * len(data), alpha=0.05)
        else:
            civ = fp.proc_woe_continuous(data, column, n_positive, n_negtive,
                                         0.05 * len(data), alpha=0.05)
        civ_list.append(civ)
        data_woe[column] = fp.woe_trans(data[column], civ)
    civ_df = woe_eval.eval_feature_detail(civ_list, outputfile)
    # Dropping variables with too-small IV is left to the caller, e.g.:
    # iv_thre = 0.001
    # iv = civ_df[['var_name','iv']].drop_duplicates()
    # x_columns = iv.var_name[iv.iv > iv_thre]
    return civ_df
def cal_iv(df, cate_vars, cont_vars, target):
    """WOE-bin the listed variables, write WOE values back into *df*,
    and return the variables ranked by information value.

    Parameters
    ----------
    df : DataFrame holding the features and the binary label column *target*.
        WOE-transformed values are written back into this frame in place.
    cate_vars : categorical variable names (binned as discrete).
    cont_vars : continuous variable names (binned as continuous).
    target : name of the binary label column.

    Returns
    -------
    DataFrame of (var_name, iv), deduplicated and sorted by iv descending.
    """
    woe_frame = df  # alias: transformed columns overwrite the caller's frame
    bad_total = sum(df[target])
    good_total = len(df) - bad_total
    min_leaf = 0.05 * len(df)
    iv_objects = []
    # Process categoricals first, then continuous — same order as before.
    for names, binner in ((cate_vars, fp.proc_woe_discrete),
                          (cont_vars, fp.proc_woe_continuous)):
        for name in names:
            info_value = binner(df, name, bad_total, good_total,
                                min_leaf, alpha=0.05)
            iv_objects.append(info_value)
            woe_frame[name] = fp.woe_trans(df[name], info_value)
    detail = eval.eval_feature_detail(iv_objects,
                                      'output_feature_detail_0927.csv')
    ranked = detail[['var_name', 'iv']].drop_duplicates()
    return ranked.sort_values('iv', ascending=False)
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None,
                      config_path=None):
    """Train WOE transformation rules for every configured variable.

    Parameters
    ----------
    infile_path : path of the training dataset csv.
    outfile_path : path the feature-detail report is written to.
    rst_path : path the pickled list of InfoValue rules is written to.
    config_path : optional model-config csv path. Defaults to the historical
        hard-coded location so existing callers are unaffected.

    Returns
    -------
    (feature_detail, rst) : detail DataFrame and list of InfoValue rules.
    """
    print('run into process_train_woe: \n',
          time.asctime(time.localtime(time.time())))
    # Generalized: the config location used to be hard-coded; keep it as the
    # default value for backward compatibility.
    if config_path is None:
        config_path = 'E:\\Code\\Python_ML_Code\\cs_model\\config\\config_cs_model_pos_m2.csv'
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)
    bin_var_list = [tmp for tmp in cfg.bin_var_list
                    if tmp in list(cfg.dataset_train.columns)]
    for var in bin_var_list:
        # fill null with the numeric sentinel
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
    # change feature dtypes
    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)
    rst = []
    # process woe transformation of continuous variables
    print('process woe transformation of continuous variables: \n',
          time.asctime(time.localtime(time.time())))
    print('cfg.global_bt', cfg.global_bt)
    print('cfg.global_gt', cfg.global_gt)
    for var in bin_var_list:
        rst.append(fp.proc_woe_continuous(cfg.dataset_train, var,
                                          cfg.global_bt, cfg.global_gt,
                                          cfg.min_sample, alpha=0.05))
    # process woe transformation of discrete variables
    print('process woe transformation of discrete variables: \n',
          time.asctime(time.localtime(time.time())))
    for var in [tmp for tmp in cfg.discrete_var_list
                if tmp in list(cfg.dataset_train.columns)]:
        # fill null with the categorical sentinel
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
        rst.append(fp.proc_woe_discrete(cfg.dataset_train, var,
                                        cfg.global_bt, cfg.global_gt,
                                        cfg.min_sample, alpha=0.05))
    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    print('save woe transformation rule into pickle: \n',
          time.asctime(time.localtime(time.time())))
    # Context manager guarantees the pickle handle is closed on error.
    with open(rst_path, 'wb') as output:
        pickle.dump(rst, output)
    return feature_detail, rst
column, n_positive, n_negtive, 0.05 * len(data), alpha=0.05) else: info_value = fp.proc_woe_continuous(data, column, n_positive, n_negtive, 0.05 * len(data), alpha=0.05) info_value_list.append(info_value) data_woe[column] = fp.woe_trans(data[column], info_value) info_df = eval.eval_feature_detail(info_value_list, './dataDump/woe_info.csv') # 删除iv值过小的变量 iv_threshold = 0.001 iv = info_df[['var_name', 'iv']].drop_duplicates() x_columns = list(iv.var_name[iv.iv > iv_threshold]) data_woe = data_woe[x_columns] data_woe.to_csv('./dataDump/data_woe.csv') labels = np.array(data.iloc[:, 0]).reshape(data.shape[0], -1) data_train = np.array(data_woe) # Configure input data_train = Variable(Tensor(data_train).type(Tensor)) labels = Variable(Tensor(labels).type(Tensor))
cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 # change feature dtypes fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type) rst = [] # process woe transformation of continuous variables for var in cfg.bin_var_list: rst.append( fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample, alpha=0.05)) # process woe transformation of discrete variables for var in cfg.discrete_var_list: # fill null cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing' rst.append( fp.proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample, alpha=0.05)) feature_detail = eval.eval_feature_detail(rst, 'output_feature_detail.csv')
civ = fp.proc_woe_discrete(alldata, i, n_positive, n_negtive, 0.05 * len(alldata), alpha=0.05) else: civ = fp.proc_woe_continuous(alldata, i, n_positive, n_negtive, 0.05 * len(alldata), alpha=0.05) civ_list.append(civ) alldata[i] = fp.woe_trans(alldata[i], civ) civ_df = eval.eval_feature_detail(civ_list) iv_thre = 0.001 iv = civ_df[['var_name', 'iv']].drop_duplicates() # 计算特征的iv值,查看特征的重要性 ''' # 3.组合特征 # 根据风控部门提供的风险判断规则,构建报警数据的组合特征 # 思路:在15天内,是否出现多种报警记录 # 1.是否在一段时间内同时发生了低电、断电、离线报警,增加新字段 combineWarn1 alldata['combineWarn1'] = None for k in range(len(alldata)): if alldata['低电总报警次数'][k] != 0 and alldata['断电报警次数'][k] != 0 and alldata['离线超时报警次数'][k] != 0: alldata['combineWarn1'][k] = 1 else: alldata['combineWarn1'][k] = 0
def process_train_woe(dataset, outfile_path=None, rst_path=None,
                      config_path=None, min_sample_weight_config=None):
    """Train WOE rules for a dataset and persist them.

    Parameters
    ----------
    dataset : training DataFrame including the 'target' column.
    outfile_path : destination of the WOE feature-detail report.
    rst_path : destination of the pickled list of InfoValue objects.
    config_path : path of the config file to load.
    min_sample_weight_config : per-variable minimum-leaf-sample adjustment.

    Returns
    -------
    (feature_detail, rst) : detail report and list of InfoValue instances.
    """
    # Load configuration and attach the dataset.
    cfg = config.config()
    cfg.load_file(config_path)
    cfg.set_dataset(dataset)
    cfg.load_min_sample_weight_config(min_sample_weight_config)
    print(cfg.min_sample_weight_config)

    # Restrict the configured variable lists to columns actually present.
    present = list(cfg.dataset_train.columns)
    continuous_vars = [name for name in cfg.bin_var_list if name in present]
    discrete_vars = [name for name in cfg.discrete_var_list if name in present]

    # Impute missing values for the continuous features, then cast dtypes.
    for name in continuous_vars:
        cfg.dataset_train.loc[cfg.dataset_train[name].isnull(), (name)] = -1
    change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # Continuous variables first, then discrete — one InfoValue per variable.
    woe_rules = []
    for name in continuous_vars:
        woe_rules.append(
            proc_woe_continuous(cfg.dataset_train, name, cfg.global_bt,
                                cfg.global_gt, cfg.get_min_sample(name),
                                alpha=0.05))
    for name in discrete_vars:
        # Discrete features use a string sentinel for missing values.
        cfg.dataset_train.loc[cfg.dataset_train[name].isnull(),
                              (name)] = 'missing'
        woe_rules.append(
            proc_woe_discrete(cfg.dataset_train, name, cfg.global_bt,
                              cfg.global_gt, cfg.get_min_sample(name),
                              alpha=0.05))

    feature_detail = eval.eval_feature_detail(woe_rules, outfile_path)

    # Persist the rules for later scoring runs.
    with open(rst_path, 'wb') as f:
        pickle.dump(woe_rules, f)

    return feature_detail, woe_rules
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None,
                      config_path=None):
    """Train WOE transformation rules from a csv dataset and persist them.

    Parameters
    ----------
    infile_path : path of the training dataset csv.
    outfile_path : path the feature-detail report is written to.
    rst_path : path the pickled list of InfoValue rules is written to.
    config_path : path of the model config file.

    Returns
    -------
    (feature_detail, rst) : detail DataFrame and list of InfoValue rules.
    """
    print('run into process_train_woe: ',
          time.asctime(time.localtime(time.time())))
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)
    bin_var_list = [
        tmp for tmp in cfg.bin_var_list
        if tmp in list(cfg.dataset_train.columns)
    ]
    for var in bin_var_list:
        # Missing continuous values are encoded with the -1 sentinel.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
    change_feature_dtype(cfg.dataset_train, cfg.variable_type)
    rst = []
    print('process woe transformation of continuous variables: ',
          time.asctime(time.localtime(time.time())))
    print('cfg.global_bt', cfg.global_bt)
    print('cfg.global_gt', cfg.global_gt)
    # Process continuous variables
    for var in bin_var_list:
        rst.append(
            proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                cfg.global_gt, cfg.min_sample, alpha=0.05))
    # Process discrete variables
    print('process woe transformation of discrete variables: ',
          time.asctime(time.localtime(time.time())))
    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(cfg.dataset_train.columns)
    ]:
        # fill null with the categorical sentinel
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = 'missing'
        rst.append(
            proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                              cfg.global_gt, cfg.min_sample, alpha=0.05))
    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    print('save woe transformation rule into pickle: ',
          time.asctime(time.localtime(time.time())))
    # FIX: use a context manager so the handle is closed even if dump raises.
    with open(rst_path, 'wb') as output:
        pickle.dump(rst, output)
    return feature_detail, rst
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None):
    """Resumable WOE training: continue from a previously pickled rule list.

    Loads the list of already-computed InfoValue rules from *rst_path*,
    skips variables that were already processed, and checkpoints the rule
    list back to *rst_path* after every newly processed variable, so an
    interrupted run can be restarted without losing progress.

    Parameters
    ----------
    infile_path : path of the training dataset csv.
    outfile_path : path the feature-detail report is written to.
    rst_path : pickle path holding the (growing) list of InfoValue rules;
        must already exist (it is read before anything is appended).

    Returns
    -------
    (feature_detail, rst) : detail DataFrame and the full rule list.
    """
    print('run into process_train_woe: \n',
          time.asctime(time.localtime(time.time())))
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model_lr.csv'
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)
    # Load the checkpoint so already-processed variables are skipped.
    with open(rst_path, 'rb') as f:
        rst = pickle.load(f)
    exists_var_list = [iv.var_name for iv in rst]
    bin_var_list = [
        tmp for tmp in cfg.bin_var_list
        if tmp in list(cfg.dataset_train.columns)
        and tmp not in exists_var_list
    ]
    for var in bin_var_list:
        # fill null with the numeric sentinel
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
    # change feature dtypes
    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)
    # process woe transformation of continuous variables
    print('process woe transformation of continuous variables: \n',
          time.asctime(time.localtime(time.time())))
    print('cfg.global_bt', cfg.global_bt)
    print('cfg.global_gt', cfg.global_gt)
    for var in bin_var_list:
        print(var)
        # Re-read the checkpoint before appending (another writer may have
        # updated it); skip the reload while the list is still empty.
        if len(rst) != 0:
            with open(rst_path, 'rb') as f:
                rst = pickle.load(f)
            print('load')
        rst.append(
            fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                   cfg.global_gt, cfg.min_sample,
                                   alpha=0.05))
        # Checkpoint after every variable.
        with open(rst_path, 'wb') as f:
            pickle.dump(rst, f)
        print('dump')
    # process woe transformation of discrete variables
    print('process woe transformation of discrete variables: \n',
          time.asctime(time.localtime(time.time())))
    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(cfg.dataset_train.columns)
            and tmp not in exists_var_list
    ]:
        print(var)
        # fill null with the categorical sentinel
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = 'missing'
        if len(rst) != 0:
            with open(rst_path, 'rb') as f:
                rst = pickle.load(f)
            print('load')
        rst.append(
            fp.proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                                 cfg.global_gt, cfg.min_sample,
                                 alpha=0.05))
        with open(rst_path, 'wb') as f:
            pickle.dump(rst, f)
        print('dump')
    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    return feature_detail, rst
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None,
                      config_path=None, rebin_feature_path=None):
    """Train WOE rules, including manually re-binned variables, and report.

    Processes four variable groups in order: auto-binned continuous,
    re-binned continuous (split points supplied via *rebin_feature_path*),
    auto-binned discrete, and re-binned discrete. Prints a per-variable
    validity summary and bin detail, plots each variable's WOE, and pickles
    the rule list.

    Parameters
    ----------
    infile_path : path of the training dataset csv.
    outfile_path : path the feature-detail report is written to.
    rst_path : path the pickled list of InfoValue rules is written to.
    config_path : path of the model config file.
    rebin_feature_path : path of the manual re-binning specification.

    Returns
    -------
    (feature_detail, rst) : detail DataFrame and list of InfoValue rules.
    """
    print('run into process_train_woe: \n',
          time.asctime(time.localtime(time.time())))
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path, data_path, rebin_feature_path)
    bin_var_list = [
        tmp for tmp in cfg.bin_var_list
        if tmp in list(cfg.dataset_train.columns)
    ]
    # Keep a reference to the pre-imputation frame for the validity summary.
    # NOTE(review): this is a reference, not a copy — after the sentinel
    # fills below it sees the imputed values too; confirm that is intended.
    orig_dataset_train = cfg.dataset_train
    # change feature dtypes
    change_feature_dtype(cfg.dataset_train, cfg.variable_type)
    rst = []
    print('cfg.global_bt', cfg.global_bt)
    print('cfg.global_gt', cfg.global_gt)
    print('cfg.global_categorical_missing', cfg.global_categorical_missing)
    print('cfg.global_numeric_missing', cfg.global_numeric_missing)
    # process woe transformation of continuous variables
    print('process woe transformation of continuous variables: \n',
          time.asctime(time.localtime(time.time())))
    for var in bin_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = cfg.global_numeric_missing
        rst.append(
            proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                cfg.global_gt, cfg.min_sample,
                                cfg.global_numeric_missing,
                                cfg.global_categorical_missing, alpha=0.05))
    # process woe transformation of continuous variables based on the
    # re-binning logic provided
    print('process woe transformation of continuous variables based on rebin logic: \n',
          time.asctime(time.localtime(time.time())))
    rebin_var_list = [
        tmp for tmp in cfg.rebin_var_list
        if tmp in list(cfg.dataset_train.columns)
    ]
    for var in rebin_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = cfg.global_numeric_missing
        var_df = cfg.dataset_rebin.loc[cfg.dataset_rebin['var_name'] == var]
        split_list = list(np.unique(var_df[['split']].astype(float)))
        rst.append(
            proc_woe_continuous_rebin(cfg.dataset_train, var, split_list,
                                      cfg.global_bt, cfg.global_gt,
                                      cfg.min_sample,
                                      cfg.global_numeric_missing,
                                      cfg.global_categorical_missing,
                                      alpha=0.05))
    # process woe transformation of discrete variables
    print('process woe transformation of discrete variables: \n',
          time.asctime(time.localtime(time.time())))
    discrete_var_list = [
        tmp for tmp in cfg.discrete_var_list
        if tmp in list(cfg.dataset_train.columns)
    ]
    # FIX: iterate the list computed above instead of re-evaluating an
    # identical comprehension.
    for var in discrete_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = cfg.global_categorical_missing
        rst.append(
            proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                              cfg.global_gt, cfg.min_sample,
                              cfg.global_numeric_missing,
                              cfg.global_categorical_missing, alpha=0.05))
    # process woe transformation of discrete variables based on re-binning
    # logic
    print('process woe transformation of discrete variables based on rebin logic: \n',
          time.asctime(time.localtime(time.time())))
    rebin_discrete_var_list = [
        tmp for tmp in cfg.rebin_discrete_var_list
        if tmp in list(cfg.dataset_train.columns)
    ]
    for var in rebin_discrete_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = cfg.global_categorical_missing
        var_df = cfg.dataset_rebin.loc[cfg.dataset_rebin['var_name'] == var]
        var_df.loc[:, 'split'] = var_df['split'].astype(object)
        rebin_list = list(np.unique(var_df[['split']]))
        rst.append(
            proc_woe_discrete_rebin(cfg.dataset_train, var, rebin_list,
                                    cfg.global_bt, cfg.global_gt,
                                    cfg.min_sample,
                                    cfg.global_numeric_missing,
                                    cfg.global_categorical_missing,
                                    alpha=0.05))
    feature_detail = woeeval.eval_feature_detail(rst, outfile_path)
    import pandas as pd
    pd.options.display.float_format = '{:.3f}'.format
    for var in (bin_var_list + rebin_var_list + discrete_var_list +
                rebin_discrete_var_list):
        missing_obs = cfg.dataset_train.loc[cfg.dataset_train[var].isin(
            [cfg.global_numeric_missing,
             cfg.global_categorical_missing])].shape[0]
        n_obs = orig_dataset_train[var].shape[0]
        # FIX: this was a Python-2 print statement inside otherwise
        # Python-3 code (a SyntaxError on Python 3).
        print('variable = ', var, '\t# obs = ', n_obs,
              '\t# valid = ', (n_obs - missing_obs),
              '\t% valid = ', (n_obs - missing_obs) * 100.0 / n_obs)
        df = feature_detail.loc[feature_detail['var_name'] == var]
        print(df[[
            'split_list', 'sub_total_sample_num', 'positive_sample_num',
            'weight_positive_freq', 'weight_negative_freq',
            'perc_cum_weight_freq', 'perc_cum_weight_positive_freq',
            'perc_cum_weight_negative_freq', 'woe_list', 'iv_list', 'ks_list'
        ]])
        woeeval.plot_woe(df, var)
    s = 'summary of WOE transformation'
    print(s.center(60, '-'))
    smry_df = feature_detail[['var_name', 'iv', 'maxks',
                              'linearity']].drop_duplicates().sort_values(
                                  'iv', ascending=False)
    print(smry_df)
    print('save woe transformation rule into pickle: \n',
          time.asctime(time.localtime(time.time())))
    # Context manager guarantees the pickle handle is closed on error.
    with open(rst_path, 'wb') as output:
        pickle.dump(rst, output)
    return feature_detail, rst