def cut_main(self):
    """Bin all candidate features, record their WOE tables and WOE-encode the data.

    Workflow:
      1. For every discrete column in ``self.dist_col`` collect its distinct
         non-NaN values. Columns with at most one such value are dropped
         (and logged); for the others the values are used as explicit
         breaks.
      2. Starting at ``self.max_bins`` and decreasing by one per round while
         the limit is above 2, run ``sc.woebin`` on the columns that still
         need binning. A serial column whose WOE values (ordered by the
         lower bin edge, ignoring the "missing" bin) are not strictly
         increasing is queued for the next round with a smaller limit.
      3. Write the accumulated WOE table to ``<filename>_bin_result.xlsx``
         and apply the final bins to the train / test / ott frames.

    Side effects: sets ``self.dist_col``, ``self.woe_df``, ``self.bins_adj``,
    ``self.df_train_woe`` and, when the corresponding frame holds any truthy
    value, ``self.df_test_woe`` / ``self.df_ott_woe``.
    """
    woe_columns = ["variable", "bin", "woe", "bin_iv", "bad", "badprob"]

    def _encode(frame):
        # Apply the final bins and strip the "_woe" marker from column names.
        encoded = sc.woebin_ply(frame, self.bins_adj)
        encoded.columns = [name.replace("_woe", "") for name in encoded.columns]
        return encoded

    breaks_adj = {}
    for col in self.dist_col:
        # `v == v` is False only for NaN, so this keeps the non-missing levels.
        levels = [v for v in self.df_train[col].unique() if v == v]
        if len(levels) <= 1:
            self.log.info("{} has been deleted, because only has one single value".format(col))
            continue
        breaks_adj[col] = levels
    self.dist_col = list(breaks_adj)

    cut_cols = self.dist_col + self.serial_col
    max_bins = self.max_bins
    self.woe_df = pd.DataFrame(columns=woe_columns)
    self.bins_adj = {}

    while max_bins > 2 and len(cut_cols) > 0:
        self.log.info("现在是分{}箱, 有{}个变量需要分箱".format(max_bins, len(cut_cols)))
        if self.target_name not in cut_cols:
            cut_cols.append(self.target_name)
        bins_adj = sc.woebin(self.df_train[cut_cols], y=self.target_name,
                             breaks_list=breaks_adj, bin_num_limit=max_bins,
                             method=self.method)

        # Rebuilt on every round: only columns failing the check are retried.
        cut_cols = []
        for key, bin_table in bins_adj.items():
            tmp = bin_table.copy()
            if key in self.serial_col:
                # Explicit copy so adding the helper column below does not
                # write onto a filtered slice of `tmp`.
                tmp1 = tmp[tmp.bin != "missing"].copy()
                if len(tmp1) == 1:
                    continue
                # Lower edge of each interval label, e.g. "[1.0,2.0)" -> 1.0;
                # labels without a comma are kept unchanged.
                tmp1["min"] = tmp1.bin.map(
                    lambda x: float(x.split(",")[0].replace("[", "")) if "," in x else x)
                tmp1 = tmp1.sort_values(by="min")
                woes = tmp1.woe.tolist()
                # NOTE(review): only strictly *increasing* WOE passes; a
                # strictly decreasing trend is treated as non-monotonic.
                # Confirm this is intended.
                if not all(x < y for x, y in zip(woes, woes[1:])):
                    cut_cols.append(key)
                    continue
            self.woe_df = pd.concat([self.woe_df, tmp[woe_columns]])

        max_bins = max_bins - 1
        # Later rounds overwrite earlier bins for re-binned columns.
        # NOTE(review): columns that never pass the check still keep their
        # last bins here - verify that is desired.
        self.bins_adj = dict(self.bins_adj, **bins_adj)

    self.log.info("仍有{}个特征无法满足单调的需求,不能分箱".format(len(cut_cols)))
    self.woe_df.to_excel("{}_bin_result.xlsx".format(self.filename))
    self.log.info("*" * 50)
    self.log.info("WOE Detail can be checked in {}_bin_result.xlsx".format(self.filename))

    self.df_train_woe = _encode(self.df_train)
    if self.df_test.any().any():
        self.df_test_woe = _encode(self.df_test)
    if self.df_ott.any().any():
        self.df_ott_woe = _encode(self.df_ott)
    self.log.info("CutBin has finished!")
def woe_trans(df, bin_dict, trans_feature, target="target"):
    """
    Apply a WOE transformation to the selected features.

    :param df: original data, must contain the target column
    :param bin_dict: dict of binning results, keyed by feature name
    :param trans_feature: list of features to transform
    :param target: str, name of the target column
    :return: df_woe, new_feature (the transformed frame and the list of
             ``<feature>_woe`` column names)
    :raises SystemExit: with a non-zero status when a requested feature has
             no entry in ``bin_dict`` or its ``_woe`` column is missing from
             the transformed frame
    """
    df = IC.check_df(df)
    bin_dict = IC.check_dict_of_df(bin_dict)
    trans_feature = IC.check_list(trans_feature)

    if not set(trans_feature).issubset(set(bin_dict)):
        print("bin_dict.key", bin_dict.keys(), '\n')
        print("trans_feature:", trans_feature)
        warnings.warn("trans_feature not in bin_dict.keys, Please double check feature set")
        # Exit status must be non-zero: status 0 would tell the calling
        # shell or scheduler that the run succeeded.
        raise SystemExit(1)

    dt = df[trans_feature + [target]]
    df_woe = woebin_ply(dt=dt, bins=bin_dict)

    new_feature = [i + "_woe" for i in trans_feature]
    produced = set(df_woe.columns.difference([target]).tolist())
    if not set(new_feature).issubset(produced):
        warnings.warn("new feature not in df_woe.columns, Please double check feature set")
        raise SystemExit(1)

    return df_woe, new_feature
def woe_transform(self, train, test):
    """Fit WOE bins on the training data and apply them to train and test.

    The training frame is reduced with ``sc.var_filter`` (always keeping
    'INDUSTRY') and handed to ``self.encode_categorical`` before binning.
    Per the original author's note that step one-hot encodes the 'INDUSTRY'
    column. The test frame is restricted to a fixed column list, passed
    through the same encoding step and transformed with the training bins.

    :param train: training data containing the 'DEFAULT_FLAG' target column
    :param test: test data containing every column in ``selected_columns``
    :return: tuple ``(train_woe, test_woe)``
    """
    selected_columns = [
        'ACCESS_CREDIT',
        'ASSESSMENT_YEAR',
        'MEDIUM_TERM_LIQUIDITY',
        'OWNERS_MANAGEMENT',
        'PRODUCT_DEMAND',
        'PROFITABILITY',
        'SHORT_TERM_LIQUIDITY',
        'TURNOVER',
        'DEFAULT_FLAG',
        'INDUSTRY',
    ]

    # Training side: filter variables, encode, then learn and apply the bins.
    filtered_train = sc.var_filter(train, 'DEFAULT_FLAG', var_kp='INDUSTRY')
    self.encode_categorical(filtered_train)
    woe_bins = sc.woebin(filtered_train, 'DEFAULT_FLAG')
    train_woe = sc.woebin_ply(filtered_train, woe_bins)

    # Test side: same encoding, bins reused from training.
    test_subset = test[selected_columns]
    self.encode_categorical(test_subset)
    return train_woe, sc.woebin_ply(test_subset, woe_bins)
# Bin the numeric columns; `rs.num_bin` returns the bins and the IVs as a pair.
# NOTE(review): -99999 presumably marks a special/missing value - confirm.
num_bin, num_iv = rs.num_bin(df=train, cols=num_cols, target='target', specials=[-99999],
                             bin_num_limit=8, count_distr_limit=0.05, sc_method='chimerge',
                             non_mono_cols=['age_in_years'], init_bins=15, init_min_samples=0.05,
                             init_method='chi')
# Merge categorical and numeric results; on a key clash the numeric entry wins.
bins = {**cat_bin, **num_bin}
ivs = {**cat_iv, **num_iv}
# WOE transformation: apply the same bins to every dataset
train_woe = sc.woebin_ply(dt=train, bins=bins)
valid_woe = sc.woebin_ply(dt=valid, bins=bins)
df_woe = sc.woebin_ply(dt=df, bins=bins)
#########################################
# 3. Feature selection
#########################################
sf = rs.SelectFeature()
# Each step takes the previous step's result: IV -> collinearity -> L1.
high_iv = sf.baseOn_iv(ivd=ivs, thred=0.05, is_draw=False)
low_vif = sf.baseOn_collinear(df=train_woe, high_iv=high_iv, thred=0.7, is_draw=False)
ml_cols, best_C = sf.baseOn_l1(X=train_woe[low_vif.keys()], y=train_woe['target'], Kfold=5,
def transform( self, X):
    """Return ``X`` converted with ``sc.woebin_ply`` using the bins in ``self.bins``."""
    return sc.woebin_ply(X, self.bins)
breaks_list=break_list) bins['JOB'] = job_bins['JOB'] # Plot WOE bins # fig, axs = plt.subplots(ncols=2) # sc.woebin_plot(bins, figsize=[8,5]) # Print results of binning # for k, bin_ in bins.items(): # print(bins[k].iloc[:,0:-2].round(2).to_latex(index=False)) # split into train and test set train, test = sc.split_df(df, 'BAD').values() # Convert values into woe train_woe = sc.woebin_ply(train, bins) test_woe = sc.woebin_ply(test, bins) # Add constant train_woe = sm.add_constant(train_woe) test_woe = sm.add_constant(test_woe) y_train = train_woe.loc[:, 'BAD'] X_train = train_woe.loc[:, train_woe.columns != 'BAD'] y_test = test_woe.loc[:, 'BAD'] X_test = test_woe.loc[:, train_woe.columns != 'BAD'] # Fit logit model lr = sm.GLM(y_train, X_train, family=sm.families.Binomial()) fit = lr.fit()
if __name__ == '__main__': train_data = pd.read_csv('./data/TrainData.csv') # 分箱(卡方 or tree) break_list = {'DebtRatio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1.7], 'NumberRealEstateLoansOrLines': [0, 1, 2, 3]} cutoff = sc.woebin(train_data, y='SeriousDlqin2yrs', method='chimerge', breaks_list=break_list) # print(cutoff["NumberOfTimes90DaysLate"]["woe"]) feature_index = ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'] # x轴的标签 train_woe = sc.woebin_ply(train_data, cutoff) woe_index = ['{}_{}'.format(s, 'woe') for s in feature_index] woe_index.insert(0, 'SeriousDlqin2yrs') train_woe = train_woe.loc[:, woe_index] # test data test_data = pd.read_csv('./data/TestData.csv') test_woe = sc.woebin_ply(test_data, cutoff) test_woe = test_woe.loc[:, woe_index] # train lr x_train = train_woe.loc[:, train_woe.columns != 'SeriousDlqin2yrs'] y_train = train_woe.loc[:, 'SeriousDlqin2yrs'] x_train = x_train.drop( ['DebtRatio_woe', 'MonthlyIncome_woe', 'NumberOfOpenCreditLinesAndLoans_woe',
def execute_data():
    """Score one JSON request with the cached scorecard model.

    Reads ``paramsData`` from the request body, converts numeric-looking
    values to ``int``, reloads the model when the requested model path
    differs from the cached one, converts the input to WOE values, predicts
    the label, builds the scorecard and returns the total score together
    with the per-feature points.

    Returns:
        A ``jsonify`` response. On success it carries ``code`` '00000',
        ``score``, ``label`` and ``featureScore``. On failure it carries one
        of the module-level error payloads: ``code10001`` (KeyError),
        ``code10002`` (ValueError or non-dict ``paramsData``) or
        ``code10003`` (any other exception).
    """
    global modelPath, binsPath, bins, model
    start = time.time()
    try:
        logger.info('入参:%s', str(request.get_data()))
        request_data = request.get_json()  # parsed request payload
        paramsJson = request_data['paramsData']
        modelFilePath = modelFilePathCheck(request_data)

        # Guard clause: everything below needs a JSON object of features.
        if not isinstance(paramsJson, dict):
            code10002['errorMsg'] = '输入值错误:请传入json格式参数'
            return jsonify(code10002)

        # Convert numeric-looking values of the raw payload to int.
        # Only existing keys are reassigned, so iterating items() is safe.
        for key, value in paramsJson.items():
            if is_number(value):
                paramsJson[key] = int(value)
        logger.info("paramsJson:%s", paramsJson)

        # Build a one-row DataFrame from the payload.
        df = pd.json_normalize(paramsJson)

        if modelPath != modelFilePath:
            logger.info('调用模型路径发生变化,重新加载模型!')
            logger.info('global modelFilePath:%s', modelPath)
            logger.info('param modelFilePath:%s', modelFilePath)
            modelPath = modelFilePath
            # Load the model.
            # NOTE(security): joblib.load unpickles the file and the path is
            # derived from the request - make sure modelFilePathCheck only
            # admits trusted locations.
            model = joblib.load(modelFilePath)
        else:
            logger.info('调用模型路径未发生变化,使用缓存中模型。')
            logger.info('global modelFilePath:%s', modelPath)

        # Convert the raw data to WOE values.
        bins = model.bins
        df_woe = sc.woebin_ply(df, bins)
        # Predict the label.
        label = model.predict(df_woe)[0]
        # Build the scorecard.
        card = sc.scorecard(bins, model, df_woe.keys())
        # Score the input.
        score = sc.scorecard_ply(df, card, only_total_score=False, print_step=0)

        # Collect the points of every feature.
        # featureScore = calculateFeatures(df, card)
        if isinstance(card, dict):
            card_df = pd.concat(card, ignore_index=True)
        elif isinstance(card, pd.DataFrame):
            card_df = card.copy(deep=True)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on card_df; fail with a clear message.
            raise TypeError('unsupported scorecard type: %s' % type(card).__name__)
        # x variables
        xs = card_df.loc[card_df.variable != 'basepoints', 'variable'].unique()
        featureScore = {i: score[i + '_points'][0] for i in xs}

        result = {
            'code': '00000',
            'score': str(score['score'][0]),
            'label': str(label),
            'featureScore': featureScore,
        }
        end = time.time()
        logger.info("运行结果:%s,模型执行耗时:%s", result, end - start)
        return jsonify(result)
    except KeyError as e:
        logger.info(e)
        code10001['errorMsg'] = '输入特征错误:' + str(e)
        return jsonify(code10001)
    except ValueError as e:
        logger.info(e)
        code10002['errorMsg'] = '输入值错误:' + str(e)
        return jsonify(code10002)
    except Exception as e:
        # Unexpected failure: keep the traceback in the log.
        logger.exception(e)
        code10003['errorMsg'] = '未知错误:' + str(e)
        return jsonify(code10003)