def process_woe_trans(in_data_path=None, rst_path=None, out_path=None, config_path=None):
    """Apply pickled WOE rules to the configured dataset and export it as CSV.

    Steps, in order:
      1. Build a ``config.config`` object and call its ``load_file`` with
         ``config_path`` and ``in_data_path``; the data is then read from
         ``cfg.dataset_train``.
      2. Replace nulls with ``-1`` for names in ``cfg.bin_var_list`` and with
         ``'missing'`` for names in ``cfg.discrete_var_list`` (only for names
         that are actual columns).
      3. Call ``fp.change_feature_dtype`` with ``cfg.variable_type``.
      4. Unpickle the rules from ``rst_path`` and overwrite each
         ``r.var_name`` column with ``fp.woe_trans(column, r)``.
      5. Write the result with ``to_csv(out_path)``.

    Args:
        in_data_path: Data file path forwarded to ``cfg.load_file``.
        rst_path: Path of a pickle holding an iterable of rule objects, each
            exposing a ``var_name`` attribute.
        out_path: Destination passed to ``to_csv``.
        config_path: Configuration file path forwarded to ``cfg.load_file``.
    """
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)
    dataset = cfg.dataset_train

    # The column set does not change while nulls are filled, so build the
    # lookup list once instead of once per comprehension.
    columns = list(dataset.columns)

    for var in cfg.bin_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    for var in cfg.discrete_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code -- rst_path must point to
    # a trusted file. The context manager closes the handle even on failure.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    for r in rst:
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path)
def process_woe_trans(cfg=None, in_data_path=None, rst_path=None, out_path=None):
    """Read a CSV, apply the first pickled WOE rule and export the result.

    Nulls are replaced with ``-1`` for names in ``cfg.bin_var_list`` and with
    ``'missing'`` for names in ``cfg.discrete_var_list`` (only for names that
    are columns of the CSV). ``fp.change_feature_dtype`` is then called with
    ``cfg.variable_type`` before the rule is applied.

    Args:
        cfg: Already-loaded configuration object providing ``bin_var_list``,
            ``discrete_var_list`` and ``variable_type``.
        in_data_path: CSV file read with ``pd.read_csv``.
        rst_path: Path of a pickle holding an indexable collection of rule
            objects, each exposing a ``var_name`` attribute.
        out_path: Destination passed to ``DataFrame.to_csv``.

    Raises:
        IndexError: If the unpickled rule collection is an empty sequence.
    """
    dataset = pd.read_csv(in_data_path)

    # The column set does not change while nulls are filled, so build the
    # lookup list once instead of once per comprehension.
    columns = list(dataset.columns)

    for var in cfg.bin_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    for var in cfg.discrete_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code -- rst_path must point to
    # a trusted file. The context manager closes the handle even on failure.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # NOTE(review): only the first rule is applied here; confirm this is
    # intentional and that the remaining rules are meant to be ignored.
    r = rst[0]
    dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path)
    # time.asctime() with no argument formats the current local time.
    print('%s\tSUCCESS EXPORT FILE: \n%s' % (time.asctime(), out_path))
def process_woe_trans(in_data_path=None, rst_path=None,
                      config_path=r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model_201705.csv'):
    """Load the configured dataset, apply pickled WOE rules and return it.

    Steps, in order:
      1. Build a ``config.config`` object and call its ``load_file`` with
         ``config_path`` and ``in_data_path``.
      2. Rename the column ``cs_cpd`` to ``cpd`` in ``cfg.dataset_train``.
      3. Replace nulls with ``-1`` for names in ``cfg.bin_var_list`` and with
         ``'missing'`` for names in ``cfg.discrete_var_list`` (only for names
         that are actual columns after the rename).
      4. Call ``fp.change_feature_dtype`` with ``cfg.variable_type``.
      5. Unpickle the rules from ``rst_path`` and overwrite each
         ``r.var_name`` column with ``fp.woe_trans(column, r)``.

    Args:
        in_data_path: Data file path forwarded to ``cfg.load_file``.
        rst_path: Path of a pickle holding an iterable of rule objects, each
            exposing a ``var_name`` attribute.
        config_path: Configuration file path forwarded to ``cfg.load_file``.
            Defaults to the path that used to be hard-coded in the body, so
            existing two-argument calls behave as before.

    Returns:
        ``cfg.dataset_train`` after the transformation.
    """
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    cfg.dataset_train = cfg.dataset_train.rename(columns={'cs_cpd': 'cpd'})
    dataset = cfg.dataset_train

    # The column set does not change while nulls are filled, so build the
    # lookup list once instead of once per comprehension.
    columns = list(dataset.columns)

    for var in cfg.bin_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    for var in cfg.discrete_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code -- rst_path must point to
    # a trusted file. The context manager closes the handle even on failure.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    for r in rst:
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    return dataset
def cal_iv(df, cate_vars, cont_vars, target):
    """Bin every listed variable, WOE-transform it and rank variables by IV.

    Variables in ``cate_vars`` go through ``fp.proc_woe_discrete`` and those
    in ``cont_vars`` through ``fp.proc_woe_continuous``; categorical ones are
    handled first. Each processed column of ``df`` is overwritten in place
    with ``fp.woe_trans(df[var], civ)``, so the caller's frame is modified.

    The collected results are handed to ``eval.eval_feature_detail`` together
    with the file name ``output_feature_detail_0927.csv``.
    NOTE(review): ``eval`` must be a module bound at file level (presumably
    ``woe.eval``), not the builtin -- confirm against the imports.

    Args:
        df: Frame holding the variables and the ``target`` column.
        cate_vars: Names of the categorical variables.
        cont_vars: Names of the continuous variables.
        target: Name of the label column; its sum is used as the positive
            count.

    Returns:
        The de-duplicated ``var_name`` / ``iv`` columns of the detail frame,
        sorted by ``iv`` in descending order.
    """
    positives = sum(df[target])
    negatives = len(df) - positives
    # Minimum bin size handed to the binning routines: 5% of the rows.
    min_bin_size = 0.05*len(df)

    results = []
    plan = (
        (cate_vars, fp.proc_woe_discrete),
        (cont_vars, fp.proc_woe_continuous),
    )
    for names, binner in plan:
        for name in names:
            info = binner(df, name, positives, negatives, min_bin_size, alpha=0.05)
            results.append(info)
            df[name] = fp.woe_trans(df[name], info)

    detail = eval.eval_feature_detail(results, 'output_feature_detail_0927.csv')
    ranking = detail[['var_name', 'iv']].drop_duplicates()
    return ranking.sort_values('iv', ascending=False)
def get_iv(df, cols, target, outputfile='./data/feature_detail_iv_list.csv'):
    """Compute WOE/IV details for the given columns.

    Works on a copy of ``df`` whose ``target`` column is renamed to
    ``'target'``; the caller's frame is not modified. Columns whose dtype
    name is ``object`` or ``category`` are binned with
    ``fp.proc_woe_discrete``, all others with ``fp.proc_woe_continuous``.

    Both binning routines share the signature
    ``proc(df, var, global_bt, global_gt, min_sample, alpha=0.01)`` where
    (per the notes that accompanied the original code):

      * ``df``: frame to compute WOE on; it must contain a ``'target'``
        column with values in {0, 1}.
      * ``var``: name of the variable to bin.
      * ``global_bt``: "bad total", the number of positive samples in ``df``.
      * ``global_gt``: "good total", the number of negative samples in ``df``.
      * ``min_sample``: minimum number of samples per bin, usually 5% of the
        row count (that is what is passed here).
      * ``alpha``: threshold for automatic splitting (default 0.01); a split
        is made when ``iv_split > iv_no_split * (1 + alpha)``.

    Args:
        df: Input frame containing ``cols`` and the ``target`` column.
        cols: Names of the columns to evaluate.
        target: Name of the label column in ``df``.
        outputfile: Path forwarded to ``eval_feature_detail``.

    Returns:
        Whatever ``woe.eval.eval_feature_detail`` returns for the collected
        per-column results.
    """
    import woe.feature_process as fp
    # Bound under a distinct local name so the builtin ``eval`` is not shadowed.
    import woe.eval as woe_eval

    data = df.copy()
    data.rename(columns={target: 'target'}, inplace=True)

    n_positive = sum(data['target'])
    n_negative = len(data) - n_positive
    min_sample = 0.05 * len(data)

    civ_list = []
    for column in list(cols):
        # The previous test ``dtypes == 'object' or 'category'`` was always
        # truthy, so continuous columns were never routed to the continuous
        # binning routine. Compare the dtype name against both labels instead.
        if data[column].dtype.name in ('object', 'category'):
            proc_woe = fp.proc_woe_discrete
        else:
            proc_woe = fp.proc_woe_continuous

        civ = proc_woe(data, column, n_positive, n_negative, min_sample, alpha=0.05)
        civ_list.append(civ)
        # Kept from the original flow: the working copy's column is replaced
        # by its WOE-transformed values.
        data[column] = fp.woe_trans(data[column], civ)

    return woe_eval.eval_feature_detail(civ_list, outputfile)
def process_woe_trans(cfg=None, rst=None, dataset=None, out_path=None):
    """Fill nulls, apply every WOE rule to ``dataset`` in place and save it.

    Args:
        cfg: Configuration object providing ``bin_var_list``,
            ``discrete_var_list`` and ``variable_type``.
        rst: Iterable of rule objects, each exposing ``var_name``.
        dataset: Frame to transform; it is modified in place.
        out_path: Destination passed to ``dataset.to_csv``.
    """
    present = list(dataset.columns)

    # Null handling: -1 for cfg.bin_var_list, 'missing' for
    # cfg.discrete_var_list; names that are not columns are skipped.
    for name in cfg.bin_var_list:
        if name not in present:
            continue
        null_rows = dataset[name].isnull()
        dataset.loc[null_rows, name] = -1

    for name in cfg.discrete_var_list:
        if name not in present:
            continue
        null_rows = dataset[name].isnull()
        dataset.loc[null_rows, name] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # Each rule overwrites the column it is named after.
    for rule in rst:
        column = rule.var_name
        dataset[column] = fp.woe_trans(dataset[column], rule)

    dataset.to_csv(out_path)
def process_woe_trans(in_data_path=None, rst_path=None, out_path=None,
                      config_path=r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'):
    """Read a CSV, apply pickled WOE rules and export it, logging each step.

    Steps, in order:
      1. Build a ``config.config`` object and call its ``load_file`` with
         ``config_path`` and ``in_data_path``.
      2. Read ``in_data_path`` with ``pd.read_csv``.
      3. Replace nulls with ``-1`` for names in ``cfg.bin_var_list`` and with
         ``'missing'`` for names in ``cfg.discrete_var_list`` (only for names
         that are columns of the CSV).
      4. Call ``fp.change_feature_dtype`` with ``cfg.variable_type``.
      5. Unpickle the rules from ``rst_path`` and overwrite each
         ``r.var_name`` column with ``fp.woe_trans(column, r)``.
      6. Write the frame to ``out_path`` without the index.

    Progress lines are printed to stdout, prefixed with the current local
    time.

    Args:
        in_data_path: CSV path, forwarded to ``cfg.load_file`` and read with
            ``pd.read_csv``.
        rst_path: Path of a pickle holding an iterable of rule objects, each
            exposing a ``var_name`` attribute.
        out_path: Destination passed to ``DataFrame.to_csv``.
        config_path: Configuration file path forwarded to ``cfg.load_file``.
            Defaults to the path that used to be hard-coded in the body, so
            existing three-argument calls behave as before.
    """
    def _log(message):
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        # time.asctime() with no argument formats the current local time.
        print('%s %s' % (time.asctime(), message))

    _log('load config file')
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    _log('fill na')
    dataset = pd.read_csv(in_data_path)

    # The column set does not change while nulls are filled, so build the
    # lookup list once instead of once per comprehension.
    columns = list(dataset.columns)

    _log('fill na continuous variables')
    for var in cfg.bin_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    _log('fill na discrete variables')
    for var in cfg.discrete_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    _log('change feature dtypes')
    fp.change_feature_dtype(dataset, cfg.variable_type)

    _log('load woe rule')
    # NOTE: pickle.load can execute arbitrary code -- rst_path must point to
    # a trusted file. The context manager closes the handle even on failure.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    for r in rst:
        print('woe trans: %s' % (r.var_name,))
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path, index=False)
    print('%s\tSUCCESS EXPORT FILE: \n%s' % (time.asctime(), out_path))
if data[column].dtypes == 'object': info_value = fp.proc_woe_discrete(data, column, n_positive, n_negtive, 0.05 * len(data), alpha=0.05) else: info_value = fp.proc_woe_continuous(data, column, n_positive, n_negtive, 0.05 * len(data), alpha=0.05) info_value_list.append(info_value) data_woe[column] = fp.woe_trans(data[column], info_value) info_df = eval.eval_feature_detail(info_value_list, './dataDump/woe_info.csv') # 删除iv值过小的变量 iv_threshold = 0.001 iv = info_df[['var_name', 'iv']].drop_duplicates() x_columns = list(iv.var_name[iv.iv > iv_threshold]) data_woe = data_woe[x_columns] data_woe.to_csv('./dataDump/data_woe.csv') labels = np.array(data.iloc[:, 0]).reshape(data.shape[0], -1) data_train = np.array(data_woe) # Configure input
n_positive = sum(data['target']) n_negtive = len(data) - n_positive for column in list(data.columns[1:]): # if data[column].dtypes == 'object': # info_value = fp.proc_woe_discrete( # data, column, n_positive, n_negtive, 0.05*len(data), alpha=0.05) # else: info_value = fp.proc_woe_continuous(data, column, n_positive, n_negtive, 0.05 * len(data), alpha=0.05) info_value_list.append(info_value) data_woe[column] = fp.woe_trans(data[column], info_value) folder = os.path.exists('./dataDump/') if not folder: os.makedirs('./dataDump/') info_df = eval.eval_feature_detail(info_value_list, './dataDump/woe_info.csv') folder = os.path.exists('./model/') if not folder: os.makedirs('./model/') with open('./model/woe_info.pkl', 'wb') as fw: pickle.dump(info_value_list, fw) # 删除iv值过小的变量
if alldata[i].dtypes == 'object': civ = fp.proc_woe_discrete(alldata, i, n_positive, n_negtive, 0.05 * len(alldata), alpha=0.05) else: civ = fp.proc_woe_continuous(alldata, i, n_positive, n_negtive, 0.05 * len(alldata), alpha=0.05) civ_list.append(civ) alldata[i] = fp.woe_trans(alldata[i], civ) civ_df = eval.eval_feature_detail(civ_list) iv_thre = 0.001 iv = civ_df[['var_name', 'iv']].drop_duplicates() # 计算特征的iv值,查看特征的重要性 ''' # 3.组合特征 # 根据风控部门提供的风险判断规则,构建报警数据的组合特征 # 思路:在15天内,是否出现多种报警记录 # 1.是否在一段时间内同时发生了低电、断电、离线报警,增加新字段 combineWarn1 alldata['combineWarn1'] = None for k in range(len(alldata)): if alldata['低电总报警次数'][k] != 0 and alldata['断电报警次数'][k] != 0 and alldata['离线超时报警次数'][k] != 0: alldata['combineWarn1'][k] = 1 else: alldata['combineWarn1'][k] = 0