def _get_info_table(df_, column, response):
    """Pivot ``df_`` by ``column`` vs ``response`` and append cumulative
    good/bad percentages plus the per-row KS statistic.

    The input frame is deep-copied first, so the caller's frame is untouched.
    Returns the enriched pivot table.
    """
    df_ = copy.deepcopy(df_)
    df_ = _Basic.get_pivot_table(df_, column=column, response=response)
    df_ = _Basic.add_basic_info_to_df(df_)
    # df_ = _Basic.add_woe_iv_to_df(df_)
    # Dividing the cumulative counts by the scalar totals is equivalent to the
    # previous np.dot(total, np.ones(n)) form, without allocating a throwaway
    # vector of identical values.
    df_["cum_good_percent"] = df_["Good_count"].cumsum().div(
        df_["Good_count"].sum())
    df_["cum_bad_percent"] = df_["Bad_count"].cumsum().div(
        df_["Bad_count"].sum())
    # KS per row: absolute gap between the two cumulative distributions.
    df_["KS"] = df_["cum_good_percent"].sub(df_["cum_bad_percent"]).abs()
    return df_
def __bestks_bin(self, single_series, y):
    """Best-KS binning for one numeric series.

    Generates candidate cut-point lists via ``_CutMethods.cut_method_flow``,
    keeps only candidates that pass the proportion check (and, for larger
    candidates, the WOE monotonicity check), then re-cuts the series with
    each surviving candidate and returns the one with the largest total IV.

    Returns ``(cut_points, result_df)`` for the best candidate, or ``False``
    when no candidate survives the checks.
    """
    df_input = _Basic.basic_prepare(y, self.good, self.bad)
    part_, single_series = _CutMethods.cut_method_flow(single_series,
                                                      self.kwargs,
                                                      response=y.name,
                                                      df_=pd.DataFrame(y))
    # Strict monotonicity: candidates of up to 3 (strict) / 4 (loose) points
    # skip the monotonicity check below.
    uncheck_len = 3 if self.strict_monotonicity else 4
    arrays = []
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                arrays.append(group)
        else:
            try:
                # Only monotonicity is checked here; could also check the
                # number of groups, with a parameter requiring each group
                # to hold at least 0.05 of the data.
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                 group, y.name, self.kwargs)
                    if _Basic.check_monotonic(tmp_woe, self.kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)
    ivs = 0
    df_last = None
    cut_last = []
    if not arrays:
        return False
    for array in arrays:
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut with the surviving (monotonic) cut points to build the
        # candidate's full result table.
        out = pd.cut(single_series, array, include_lowest=True, right=False)
        # NaNs go into a dedicated fill-value bucket.
        out = out.cat.add_categories([self.fill_value
                                      ]).fillna(self.fill_value)
        out = out.cat.remove_unused_categories()
        df_input_tmp[single_series.name] = out
        df_input_tmp = _Basic.get_pivot_table(df_input_tmp, y.name,
                                              single_series.name)
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        # The steps above repeat the earlier preparation pipeline.
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        # print df_output
        # Keep the candidate with the largest total IV seen so far.
        if ivs < iv_sum:
            cut_last = array
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)
    # print(df_last)
    return cut_last, df_last
def __general_bin(self, single_series, y):
    """General binning for one numeric series.

    Same candidate filtering as best-KS binning (proportion check, plus WOE
    monotonicity for larger candidates), but instead of scoring every
    surviving candidate it simply uses the *last* surviving cut-point list.

    Returns ``(cut_points, result_df)`` or ``False`` when no candidate
    survives the checks.
    """
    array = []
    df_input = _Basic.basic_prepare(y, self.good, self.bad)
    part_, single_series = _CutMethods.cut_method_flow(
        single_series, self.kwargs)
    # Strict monotonicity: short candidates skip the monotonicity check.
    uncheck_len = 3 if self.strict_monotonicity else 4
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                array.append(group)
        else:
            try:
                # Only monotonicity is checked here; could also check the
                # number of groups, with a parameter requiring each group
                # to hold at least 0.05 of the data.
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                 group, y.name, self.kwargs)
                    if _Basic.check_monotonic(tmp_woe, self.kwargs):
                        array.append(group)
            except KeyError as error:
                logging.error(error)
    if not array:
        return False
    # Re-cut with the last surviving (monotonic) cut points.
    out = pd.cut(single_series, array[-1], include_lowest=True)
    # NaNs go into a dedicated fill-value bucket.
    out = out.cat.add_categories([self.fill_value]).fillna(self.fill_value)
    out = out.cat.remove_unused_categories()
    df_input[single_series.name] = out
    df_input = _Basic.get_pivot_table(df_input, y.name, single_series.name)
    # Was a bare print() left over from debugging; route through logging so
    # normal runs are not spammed on stdout.
    logging.debug(df_input)
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    # The steps above repeat the earlier preparation pipeline.
    df_output = _Basic.add_ks_to_df(df_input)
    # print(df_output)
    return array[-1], df_output
def get_sheet1_and_sheet2(df_train,
                          df_test=None,
                          drop_cols=None,
                          bad=1,
                          good=0,
                          cut_method="cumsum",
                          max_cut_part=10,
                          min_group_num=3,
                          group_min_percent=0.05,
                          check_monotonicity=True,
                          strict_monotonicity=True,
                          fill_value="-999",
                          response='code',
                          enable_iv_threshold=True,
                          iv_min=0.02,
                          keep_largest=2000,
                          enable_single_threshold=True,
                          single_threshold=0.8,
                          include_none=True,
                          n_jobs=4):
    """Aggregate the data and build sheet1 and sheet2 for Excel/CSV export.

    Numeric columns are binned with ``Binning``; per-variable summary stats
    go into ``sheet1`` and the per-bin detail tables into ``sheet2``.  When
    ``df_test`` is given, PSI columns are computed against the test set.

    Returns ``(sheet1, sheet2, new_bins)`` where ``new_bins`` maps variable
    name to its final cut points (endpoints widened to +/-inf).

    NOTE(review): ``check_monotonicity``, ``keep_largest`` and
    ``include_none`` are accepted but not referenced in this body — confirm
    whether they are consumed elsewhere or dead parameters.
    """
    # Always drop the id-like columns; merge with any caller-supplied list.
    if drop_cols is None:
        drop_cols = ["uid", "umobile", "date"]
    else:
        drop_cols = list(set(["uid", "umobile", "date"] + drop_cols))
    df_train = df_train.drop(drop_cols, axis=1)
    # Split numeric vs object columns; both keep the response column.
    df_train_num = df_train.select_dtypes(exclude=["object"])
    if response not in df_train_num:
        df_train_num[response] = df_train[response]
    df_train_obj = df_train.select_dtypes(include=["object"])
    if response not in df_train_obj:
        df_train_obj[response] = df_train[response]
    num_cols = list(df_train_num.drop(response, axis=1).columns)
    sheet1_cols = [
        'var', 'iv', 'ks', 'group_num', 'missing_percent',
        'max_single_percent', 'min', '25%', '50%', '75%', 'max', 'std',
        'mean', 'mode', 'median'
    ]
    sheet2_cols = [
        'Bad_count', 'Good_count', 'IV', 'KS', 'WOE', 'cum_bad_percent',
        'cum_good_percent', 'default_percent', 'inside_bad_percent',
        'inside_good_percent', 'total_percent_x', 'total_x', 'var_name',
        'var_new_x', 'var_scope'
    ]
    # PSI columns only exist when a test set is supplied.
    if df_test is not None:
        sheet1_cols.insert(3, 'psi')
        sheet2_cols.insert(4, 'PSI')
        sheet2_cols.append('var_new_y')
        sheet2_cols.append('total_y')
        sheet2_cols.append('total_percent_y')
    log("开始进行连续性变量分箱操作:{0}".format(cut_method))
    binning = Binning(cut_method=cut_method,
                      strict_monotonicity=strict_monotonicity,
                      min_proportion=group_min_percent,
                      bad=bad,
                      good=good,
                      max_cut_part=max_cut_part,
                      fill_value=fill_value)
    results = binning.fit(df_train,
                          df_train[response],
                          columns=num_cols,
                          n_jobs=n_jobs)
    # results structure: {name: (bins, df), name: (bins, df)}
    log("分箱结束!开始进行连续性数据聚合...")
    sheet1 = {}
    sheet2 = {}
    new_bins = {}
    for name, value in results.iteritems(
    ):  # name: variable name, value: ([cut points], binned DataFrame)
        if value is False:  # binning failed for this variable
            continue
        df_ = value[1].sort_index()  # goes into sheet2
        s_ = df_train[name]
        df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
        df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
        if enable_single_threshold is True:  # single-value dominance filter
            if df_["total_percent_x"].max() > single_threshold:
                continue
        if enable_iv_threshold is True:  # IV filter
            if df_["IV"].replace([np.inf, -np.inf], 0).sum() < iv_min:
                continue
        if df_.shape[0] < min_group_num:  # minimum group-count filter
            continue
        asC_name = name + "_asC"
        # Build sheet2 for this variable.
        df_["var_name"] = [asC_name] * df_.shape[0]
        df_["var_new_x"] = df_.index
        df_[asC_name] = df_["var_new_x"].cat.codes
        df_.set_index(asC_name, inplace=True)
        # Widen the outer cut points to +/-inf so any value falls in a bin.
        cut_points = value[0]
        cut_points[0] = -np.inf
        cut_points[-1] = np.inf
        new_bins[name] = cut_points  # replaces the original finite cut points
        if df_test is not None:
            tmp_c = pd.cut(df_test[name], cut_points)
        else:
            tmp_c = pd.cut(s_, cut_points)
        # One extra group in df_ means a fill-value bucket exists; mirror it.
        if len(tmp_c.cat.categories) + 1 == df_.shape[0]:
            tmp_c = tmp_c.cat.add_categories([fill_value]).fillna(fill_value)
        df_["var_scope"] = tmp_c.cat.categories
        # Select the columns in the required order.
        if df_test is not None:
            tmp_c = tmp_c.to_frame()
            tmp_c.columns = ['bin']
            tmp_c[response] = df_test[response]
            tmp_c["_tmp_count_"] = np.ones(
                tmp_c.shape[0])  # explicit ones vector; a dtype could be set
            tmp_pivot_df = pd.pivot_table(tmp_c,
                                          index='bin',
                                          columns=response,
                                          aggfunc=len,
                                          values="_tmp_count_").fillna(0)
            tmp_pivot_df['total'] = tmp_pivot_df[good] + tmp_pivot_df[bad]
            df_['total_y'] = tmp_pivot_df['total'].values
            df_['total_percent_y'] = df_['total_y'].div(df_['total_y'].sum())
            # PSI per bin: (p_x - p_y) * ln(p_x / p_y).
            df_['PSI'] = df_["total_percent_x"].div(
                df_["total_percent_y"]).map(np.log).mul(
                    df_["total_percent_x"].sub(df_["total_percent_y"]))
            df_['var_new_y'] = df_['var_new_x']
        sheet2[name] = df_[sheet2_cols].replace([np.inf, -np.inf, np.nan], 0)
        # Build sheet1 for this variable.
        r = s_.describe().to_frame().T  # goes into sheet1
        r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
        r["ks"] = df_["KS"].max()
        r["group_num"] = df_.shape[0]
        r["missing_percent"] = 1 - r["count"] / s_.shape[0]
        r["max_single_percent"] = (df_["total_x"] /
                                   df_["total_x"].sum()).max()
        r["mode"] = ','.join(s_.mode().astype(str).values)  # s_.mode()
        r["median"] = s_.median()
        r["var"] = asC_name
        if df_test is not None:
            r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
        sheet1[name] = r[sheet1_cols]
    log("开始稀疏变量分箱操作")
    # Map response labels to the count-column names used by the pivots.
    df_train_obj[response] = df_train_obj[response].replace(
        good, "Good_count").replace(bad, "Bad_count")
    for name in df_train_obj.columns:  # i_province2
        if enable_single_threshold is True:  # single-value dominance filter
            max_count = float(
                df_train_obj.groupby(name)[response].count().max())
            if max_count / df_train_obj.shape[0] > single_threshold:
                continue
        tmp_c = _Basic.basic_prepare(df_train_obj[response],
                                     bad=bad,
                                     good=good)
        tmp_c[name] = df_train_obj[name]
        tmp_c = _Basic.get_pivot_table(tmp_c, response=response, column=name)
        tmp_c = _Basic.add_basic_info_to_df(tmp_c)
        tmp_c = _Basic.add_woe_iv_to_df(tmp_c)
        tmp_c = _Basic.add_ks_to_df(tmp_c)
        print(name)
        # todo: string/sparse variable filtering (e.g. province); results of
        # this loop are currently not added to sheet1/sheet2.
    return sheet1, sheet2, new_bins
def merge_asD(self, results):
    """Aggregate discrete ("asD") binning results into summary tables.

    ``results`` maps variable name to ``(bins, binned DataFrame)``; ``bins``
    is ``False`` (failed), the string ``'single'`` (each value is its own
    bin), or a dict mapping bin code to the list of original values.

    Returns ``(sheet1, sheet2, new_bins)``.
    """
    show("分箱结束!开始进行数值型数据聚合...")
    sheet1 = {}
    sheet2 = {}
    new_bins = {}
    pg = Progressive(len(results), step=2)
    speed_of_progress = 0
    for name, value in results.iteritems(
    ):  # name: variable name, value: ([cut points], binned DataFrame)
        pg.bar(speed_of_progress, "数值型数据聚合")
        speed_of_progress += 1
        df_ = value[1]
        s_ = self.df_train[name]
        asD_name = name + "_asD"
        if value[0] is False:  # binning failed for this variable
            continue
        elif value[0] == 'single':
            df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
            df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
            if self.enable_single_threshold is True:  # dominance filter
                if df_["total_percent_x"].max() > self.single_threshold:
                    continue
            if self.enable_iv_threshold is True:  # IV filter
                if df_["IV"].replace([np.inf, -np.inf],
                                     0).sum() < self.iv_min:
                    continue
            df_["var_name"] = [asD_name] * df_.shape[0]
            df_["var_new_x"] = df_.index
            df_["var_scope"] = df_.index
            df_[asD_name] = range(0, df_.shape[0])
            df_.set_index(asD_name, inplace=True)
            if self.df_test is not None:
                # Count test-set rows per original value to compute PSI.
                tmp_df = s_.to_frame().fillna(self.fill_value)
                tmp_df["_tmp_count_"] = np.ones(tmp_df.shape[0])
                tmp_df = tmp_df.groupby(name).count()
                tmp_df.columns = ['total_y']
                df_ = pd.concat([df_, tmp_df], axis=1)
                df_['total_percent_y'] = df_['total_y'].div(
                    df_['total_y'].sum())
                # PSI per bin: (p_x - p_y) * ln(p_x / p_y).
                df_['PSI'] = df_["total_percent_x"].div(
                    df_["total_percent_y"]).map(np.log).mul(
                        df_["total_percent_x"].sub(df_["total_percent_y"]))
                df_['var_new_y'] = df_['var_new_x']
                sheet2[name] = df_[self.sheet2_cols].replace(
                    [np.inf, -np.inf, np.nan], 0)
            new_bins[name] = "single"
            # NOTE(review): repeats the assignment above when df_test is set;
            # harmless but redundant.
            sheet2[name] = df_[self.sheet2_cols].replace(
                [np.inf, -np.inf, np.nan], 0)
            r = s_.describe().to_frame().T  # goes into sheet1
            r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
            r["ks"] = df_["KS"].max()
            r["group_num"] = df_.shape[0]
            r["missing_percent"] = 1 - r["count"] / s_.shape[0]
            r["max_single_percent"] = (df_["total_x"] /
                                       df_["total_x"].sum()).max()
            r["mode"] = ','.join(s_.mode().astype(str).values)  # s_.mode()
            r["var"] = asD_name
            if self.df_test is not None:
                r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
            # 'psi' may be missing when df_test is None; fall back to the
            # column list without it.
            try:
                sheet1[name] = r[[
                    'var', "iv", 'ks', 'psi', 'group_num', 'missing_percent',
                    'max_single_percent'
                ]]
            except Exception as e:
                sheet1[name] = r[[
                    'var', "iv", 'ks', 'group_num', 'missing_percent',
                    'max_single_percent'
                ]]
        elif isinstance(value[0], dict):
            # Grouped bins: dict of bin code -> list of original values.
            groups = value[0]
            df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
            df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
            # if self.enable_single_threshold is True:  # dominance filter
            #     if df_["total_percent_x"].max() > self.single_threshold:
            #         continue
            if self.enable_iv_threshold is True:  # IV filter
                if df_["IV"].replace([np.inf, -np.inf],
                                     0).sum() < self.iv_min:
                    continue
            # Build sheet2 for this variable.
            df_["var_name"] = [asD_name] * df_.shape[0]
            groups_ = {k: '|'.join(v) for k, v in groups.iteritems()}
            df_["var_new_x"] = pd.Series(groups_)
            df_["var_scope"] = pd.Series(groups_)
            df_.index.name = asD_name
            if self.df_test is not None:
                # Apply the same grouping to the test set, then pivot it.
                test_df_ = _Basic.basic_prepare(
                    self.df_test[self.response],
                    bad=self.bad,
                    good=self.good)
                test_df_[name] = self.df_test[name]
                for code, items in groups.iteritems():
                    test_df_[name] = test_df_[name].replace(items, code)
                test_df_ = _Basic.get_pivot_table(test_df_, self.response,
                                                  name)
                test_df_ = _Basic.add_basic_info_to_df(test_df_)
                df_['total_y'] = test_df_['total'].values
                df_['total_percent_y'] = df_['total_y'].div(
                    df_['total_y'].sum())
                # PSI per bin: (p_x - p_y) * ln(p_x / p_y).
                df_['PSI'] = df_["total_percent_x"].div(
                    df_["total_percent_y"]).map(np.log).mul(
                        df_["total_percent_x"].sub(df_["total_percent_y"]))
                df_['var_new_y'] = df_['var_new_x']
            new_bins[name] = groups
            sheet2[name] = df_[self.sheet2_cols].replace(
                [np.inf, -np.inf, np.nan], 0)
            # Build sheet1 for this variable.
            r = s_.describe().to_frame().T  # goes into sheet1
            r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
            r["ks"] = df_["KS"].max()
            r["group_num"] = df_.shape[0]
            r["missing_percent"] = 1 - r["count"] / s_.shape[0]
            r["max_single_percent"] = (df_["total_x"] /
                                       df_["total_x"].sum()).max()
            r["mode"] = ','.join(s_.mode().astype(str).values)  # s_.mode()
            r["var"] = asD_name
            if self.df_test is not None:
                r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
                sheet1[name] = r[[
                    'var', "iv", 'ks', 'psi', 'group_num', 'missing_percent',
                    'max_single_percent'
                ]]
            else:
                sheet1[name] = r[[
                    'var', "iv", 'ks', 'group_num', 'missing_percent',
                    'max_single_percent'
                ]]
        else:
            raise ValueError("分箱返回的结果不符合预期{0}".format(type(value[0])))
    return sheet1, sheet2, new_bins
def _bin_merge_asD(self, df_train_obj, columns):
    """Bin categorical ("asD") variables by grouping categories on their WOE.

    For each column: compute per-category WOE, generate candidate WOE
    cut-point lists (quantile + cumsum flows), pick the candidate with the
    largest total IV, group the original categories accordingly, and build
    the sheet1/sheet2 summary rows (PSI against ``self.df_test`` when set).

    Returns ``(sheet1, sheet2, new_bins)`` where ``new_bins`` maps variable
    name to its ``{bin code: [original values]}`` grouping.
    """
    log("开始数值型变量分箱操作")
    sheet1 = {}
    sheet2 = {}
    new_bins = {}
    # df_train_obj[self.response] = df_train_obj[self.response].replace(self.good, "Good_count").replace(self.bad, "Bad_count")
    for name in columns:  # i_province2
        if self.enable_single_threshold is True:  # dominance filter
            max_count = float(
                df_train_obj.groupby(name)[self.response].count().max())
            if max_count / df_train_obj.shape[0] > self.single_threshold:
                continue
        # Per-category proportions, KS, IV and related stats.
        tmp_c = _Basic.basic_prepare(df_train_obj[self.response],
                                     bad=self.bad,
                                     good=self.good)
        tmp_c[name] = df_train_obj[name]
        tmp_c = _Basic.get_pivot_table(tmp_c,
                                       response=self.response,
                                       column=name)
        tmp_c = _Basic.add_basic_info_to_df(tmp_c)
        tmp_c = _Basic.add_woe_iv_to_df(tmp_c)
        if self.woe_fill == 'avg':  # fill placeholder WOE with the mean
            avg = tmp_c["WOE"].replace(self.fill_items, np.nan).mean()
            tmp_c["WOE"] = tmp_c["WOE"].replace(self.fill_items, avg)
        tmp_c = _Basic.add_ks_to_df(tmp_c)
        # Bin on the WOE values above; keep the candidate with the largest
        # total IV (a future option could prefer the most groups instead).
        parts = []
        last_cut_points = []
        max_iv = 0
        parts, _ = _CutMethods.quantile_cut_flow(
            tmp_c['WOE'], parts, max_cut_part=self.max_cut_part)
        parts, _ = _CutMethods.cumsum_cut_flow(
            tmp_c['WOE'],
            parts,
            add_min_group=True,
            max_cut_part=self.max_cut_part)
        for part in parts:
            bins_ = pd.cut(tmp_c['WOE'], part, include_lowest=True)
            tmp_c['bins'] = bins_.cat.codes
            tmp_df = pd.pivot_table(tmp_c,
                                    values=['Bad_count', 'Good_count'],
                                    index='bins',
                                    aggfunc=np.sum)
            tmp_df = _Basic.add_basic_info_to_df(tmp_df)
            tmp_df = _Basic.add_woe_iv_to_df(tmp_df)
            iv_ = tmp_df['IV'].sum()
            if iv_ > max_iv:
                # BUGFIX: max_iv was never updated, so last_cut_points ended
                # up as the *last* candidate with IV > 0 rather than the one
                # with the maximum IV.
                max_iv = iv_
                last_cut_points = part
        # Build sheet1 and sheet2 from the winning cut.
        tmp_c['bins'] = pd.cut(tmp_c['WOE'],
                               last_cut_points,
                               include_lowest=True).cat.codes
        groups = {}
        for i in tmp_c['bins'].unique():
            items_ = tmp_c[tmp_c['bins'] == i].index.tolist()
            groups[i] = items_
        df_ = _Basic.basic_prepare(df_train_obj[self.response],
                                   bad=self.bad,
                                   good=self.good)
        df_[name] = df_train_obj[name]
        for code, items in groups.iteritems():
            df_[name] = df_[name].replace(items, code)
        df_ = _Basic.get_pivot_table(df_, self.response, name)
        df_ = _Basic.add_basic_info_to_df(df_)
        df_ = _Basic.add_woe_iv_to_df(df_)
        df_ = _Basic.add_ks_to_df(df_)
        df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
        df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
        # Build sheet2 for this variable.
        asD_name = name + "_asD"
        df_["var_name"] = [asD_name] * df_.shape[0]
        groups_ = {k: '|'.join(v) for k, v in groups.iteritems()}
        df_["var_new_x"] = pd.Series(groups_)
        df_["var_scope"] = pd.Series(groups_)
        if self.df_test is not None:
            # Apply the same grouping to the test set for PSI.
            test_df_ = _Basic.basic_prepare(self.df_test[self.response],
                                            bad=self.bad,
                                            good=self.good)
            test_df_[name] = self.df_test[name]
            for code, items in groups.iteritems():
                test_df_[name] = test_df_[name].replace(items, code)
            test_df_ = _Basic.get_pivot_table(test_df_, self.response, name)
            test_df_ = _Basic.add_basic_info_to_df(test_df_)
            df_['total_y'] = test_df_['total'].values
            df_['total_percent_y'] = df_['total_y'].div(
                df_['total_y'].sum())
            # PSI per bin: (p_x - p_y) * ln(p_x / p_y).
            df_['PSI'] = df_["total_percent_x"].div(
                df_["total_percent_y"]).map(np.log).mul(
                    df_["total_percent_x"].sub(df_["total_percent_y"]))
            df_['var_new_y'] = df_['var_new_x']
        new_bins[name] = groups
        sheet2[name] = df_[self.sheet2_cols].replace(
            [np.inf, -np.inf, np.nan], 0)
        # Build sheet1 for this variable.
        s_ = df_train_obj[name]
        r = s_.describe().to_frame().T  # goes into sheet1
        r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
        r["ks"] = df_["KS"].max()
        r["group_num"] = df_.shape[0]
        r["missing_percent"] = 1 - r["count"] / s_.shape[0]
        r["max_single_percent"] = (df_["total_x"] /
                                   df_["total_x"].sum()).max()
        r["mode"] = ','.join(s_.mode().astype(str).values)  # s_.mode()
        r["var"] = asD_name
        if self.df_test is not None:
            r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
        # 'psi' may be missing when df_test is None; fall back to the
        # column list without it.
        try:
            sheet1[name] = r[[
                'var', "iv", 'ks', 'psi', 'group_num', 'missing_percent',
                'max_single_percent'
            ]]
        except Exception as e:
            sheet1[name] = r[[
                'var', "iv", 'ks', 'group_num', 'missing_percent',
                'max_single_percent'
            ]]
    return sheet1, sheet2, new_bins