def __bestks_bin(self, single_series, y):
    """Pick, among the candidate cuts of ``single_series``, the one with the largest IV sum.

    Candidate cut-point lists come from ``CutMethods.cut_method_flow``. Each
    candidate is kept only if it passes the proportion check and, for
    candidates longer than ``uncheck_len``, the monotonicity check on its
    temporary WOE. Every surviving candidate is then re-cut and scored by
    the sum of its ``IV`` column.

    Side effect: ``self.kwargs['df_']`` and ``self.kwargs['response']`` are
    overwritten before the cut method is invoked.

    :param single_series: feature values to be binned.
    :param y: response series; ``y.name`` is used as the response column.
    :return: ``False`` when no candidate survives the checks or when no
        candidate reaches an IV sum above 0; otherwise
        ``(cut_points, df)`` with ``df`` sorted by index.
    """
    df_input = _Basic.basic_prepare(y, self.good, self.bad)
    self.kwargs['df_'] = pd.DataFrame(y)
    self.kwargs['response'] = y.name
    part_, single_series, special = CutMethods.cut_method_flow(
        single_series, **self.kwargs)
    # Candidates at or below this length skip the monotonicity check.
    uncheck_len = 2 if self.strict_monotonicity else 3

    arrays = []
    for group in part_:
        if len(group) < 2:
            # Occurs when a special value exists.
            arrays.append(group)
        elif len(group) <= uncheck_len:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                arrays.append(group)
        else:
            try:
                # Only the proportion and monotonicity are checked here.
                if _Basic.check_proportion(single_series, group,
                                           self.kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                 group, y.name, self.kwargs)
                    if _Basic.check_monotonic(tmp_woe, self.kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)

    if not arrays:
        return False

    ivs = 0
    df_last = None
    cut_last = []
    for array in arrays:
        if special[0]:
            # Insert the special point into the cut points (mutates the
            # candidate list in place).
            # NOTE(review): nesting reconstructed from a flattened source;
            # append/sort are assumed to run for every special value.
            if special[1] == 0.01:
                array[0] = special[0] + special[1]
            array.append(special[0])
            array.sort()
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut with the surviving cut points and rebuild the statistics.
        out = pd.cut(single_series, array, include_lowest=True)
        out = out.cat.add_categories([self.fill_value]).fillna(
            self.fill_value)
        out = out.cat.remove_unused_categories()
        df_input_tmp[single_series.name] = out
        df_input_tmp = _Basic.get_pivot_table(df_input_tmp, y.name,
                                              single_series.name)
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        if ivs < iv_sum:
            cut_last = array
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)

    if df_last is None:
        # No candidate produced an IV sum above 0 (e.g. all zero or NaN);
        # report failure the same way as the "no candidates" case instead
        # of crashing on ``None.sort_index``.
        return False
    df_last.sort_index(inplace=True)
    return cut_last, df_last
def __get_tmp_woe(df_n, series_, group, response):
    """Return the ``WOE`` column obtained when ``series_`` is cut at ``group``.

    The column ``series_.name`` of ``df_n`` is temporarily replaced by the
    binned values (missing entries filled with ``kwargs["fill_value"]``),
    passed through the ``_Basic`` pivot / WOE helpers, and finally reset to
    the raw ``series_``. Rows removed by ``dropna`` and the row indexed by
    the fill value are excluded from the result.

    :param df_n: working frame; its ``series_.name`` column is overwritten.
    :param series_: raw feature values.
    :param group: cut points handed to ``pd.cut``.
    :param response: name of the response column.
    :return: the ``WOE`` series of the remaining bins.
    """
    column = series_.name
    filler = kwargs["fill_value"]

    # Bin the raw values, then turn missing bins into an explicit category.
    df_n[column] = pd.cut(series_, group, include_lowest=True)
    with_filler = df_n[column].cat.add_categories([filler])
    df_n[column] = with_filler.fillna(filler)
    df_n[column] = df_n[column].cat.remove_unused_categories()

    woe_table = _Basic.add_woe_iv_to_df(
        _Basic.add_basic_info_to_df(
            _Basic.get_pivot_table(df_n, response, column)))
    woe_table = woe_table.dropna()
    is_real_bin = woe_table.index != filler
    tmp_woe = woe_table[is_real_bin]["WOE"]

    # Put the raw values back so the caller's frame can be reused.
    df_n[column] = series_
    return tmp_woe
def single_bin(self, single_series, y):
    """Build the statistics table using the raw values of the column as bins.

    Per the original note, this is meant for columns with at most 12
    distinct categories, which are grouped directly without cutting.

    :param single_series: feature values, used unchanged as the grouping key.
    :param y: response series; ``y.name`` is used as the response column.
    :return: ``('single', df)`` where ``df`` is the table sorted by index.
    """
    column = single_series.name
    table = _Basic.basic_prepare(y, self.good, self.bad)
    table[column] = single_series
    table = _Basic.get_pivot_table(table, y.name, column)
    # Enrich the pivot table step by step: basic info, WOE/IV, then KS.
    for enrich in (_Basic.add_basic_info_to_df,
                   _Basic.add_woe_iv_to_df,
                   _Basic.add_ks_to_df):
        table = enrich(table)
    return 'single', table.sort_index()
def get_sample_bestks(df_, test):
    """Demo: run the "bestks" cut on ``test`` and print the best-IV table.

    Sets ``kwargs["cut_method"]`` to ``"bestks"``, filters the candidate
    cuts by the proportion / monotonicity checks, scores each survivor by
    the sum of its ``IV`` column and prints the winning table (``None`` if
    no candidate scored above 0) between two banner lines.

    :param df_: frame holding the response in its ``"code"`` column.
    :param test: feature values to be binned.
    """
    banner = "*****************************{0}**************************************"
    kwargs["cut_method"] = "bestks"
    print(banner.format(kwargs["cut_method"]))

    # part_ holds the candidate cut-point combinations to be scored by IV.
    part_, test = _CutMethods.cut_method_flow(test, kwargs, response="code",
                                              df_=df_)
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"],
                                    kwargs["bad"])
    # Candidates at or below this length skip the monotonicity check.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4

    arrays = []
    for group in part_:
        if len(group) > uncheck_len:
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group,
                                                 "code", kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)
        elif _Basic.check_proportion(test, group, kwargs):
            arrays.append(group)

    # Among the cuts that passed the checks, keep the one with the top IV.
    best_iv = 0
    df_last = None
    for cut_points in arrays:
        scratch = copy.deepcopy(df_input)
        binned = pd.cut(test, cut_points, include_lowest=True)
        binned = binned.cat.add_categories([kwargs["fill_value"]])
        binned = binned.fillna(kwargs["fill_value"])
        scratch["test"] = binned.cat.remove_unused_categories()
        scratch = _Basic.get_pivot_table(scratch, "code", "test")
        scratch = _Basic.add_basic_info_to_df(scratch)
        scratch = _Basic.add_woe_iv_to_df(scratch)
        df_output = _Basic.add_ks_to_df(scratch)
        iv_sum = df_output["IV"].sum()
        if iv_sum > best_iv:
            best_iv = iv_sum
            df_last = copy.deepcopy(df_output)

    print(df_last)
    print(banner.format(kwargs["cut_method"]))
def __general_bin(self, single_series, y):
    """Bin ``single_series`` using the last candidate cut that passes the checks.

    Candidate cut-point lists come from ``CutMethods.cut_method_flow``; a
    candidate is kept if it passes the proportion check and, when longer
    than ``uncheck_len``, the monotonicity check on its temporary WOE. The
    last kept candidate is used for the final cut.

    :param single_series: feature values to be binned.
    :param y: response series; ``y.name`` is used as the response column.
    :return: ``False`` when no candidate is kept; otherwise
        ``(cut_points, df)`` with ``df`` sorted by index.
    """
    accepted = []
    df_input = _Basic.basic_prepare(y, self.good, self.bad)
    part_, single_series, special = CutMethods.cut_method_flow(
        single_series, **self.kwargs)
    # Candidates at or below this length skip the monotonicity check.
    if self.strict_monotonicity:
        uncheck_len = 2
    else:
        uncheck_len = 3

    for group in part_:
        size = len(group)
        if size < 2:
            # Occurs when a special value exists.
            accepted.append(group)
            continue
        if size <= uncheck_len:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                accepted.append(group)
            continue
        try:
            # Only the proportion and monotonicity are checked here.
            if _Basic.check_proportion(single_series, group, self.kwargs):
                tmp_woe = _Basic.get_tmp_woe(df_input, single_series, group,
                                             y.name, self.kwargs)
                if _Basic.check_monotonic(tmp_woe, self.kwargs):
                    accepted.append(group)
        except KeyError as error:
            logging.error(error)

    if not accepted:
        return False

    cut_points = accepted[-1]
    if special[0]:
        # Insert the special point into the cut points (in place).
        # NOTE(review): nesting reconstructed from a flattened source;
        # append/sort are assumed to run for every special value.
        if special[1] == 0.01:
            cut_points[0] = special[0] + special[1]
        cut_points.append(special[0])
        cut_points.sort()

    # Re-cut with the chosen points and rebuild the statistics table.
    binned = pd.cut(single_series, cut_points, include_lowest=True)
    binned = binned.cat.add_categories([self.fill_value])
    binned = binned.fillna(self.fill_value)
    df_input[single_series.name] = binned.cat.remove_unused_categories()
    df_input = _Basic.get_pivot_table(df_input, y.name, single_series.name)
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_output = _Basic.add_ks_to_df(df_input)
    df_output.sort_index(inplace=True)
    return cut_points, df_output
def get_sample(df_, test):
    """Demo: run the "quantile" cut on ``test`` and print the resulting table.

    Sets ``kwargs["cut_method"]`` to ``"quantile"``, keeps the candidate
    cuts that pass the proportion / monotonicity checks, prints the kept
    candidates, then bins ``test`` with the last one and prints its
    statistics table between two banner lines.

    :param df_: frame holding the response in its ``"code"`` column.
    :param test: feature values to be binned.
    """
    banner = "*****************************{0}**************************************"
    kwargs["cut_method"] = "quantile"
    print(banner.format(kwargs["cut_method"]))

    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"],
                                    kwargs["bad"])
    kept = []
    # part_: candidate quantile cut points; test: the data returned by the
    # cut flow.
    part_, test = _CutMethods.cut_method_flow(test, kwargs)
    # Candidates at or below this length skip the monotonicity check.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4

    for group in part_:
        if len(group) > uncheck_len:
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group,
                                                 "code", kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        kept.append(group)
            except KeyError as error:
                logging.error(error)
        elif _Basic.check_proportion(test, group, kwargs):
            kept.append(group)
    print(kept)

    # Re-cut with the last accepted candidate and rebuild the statistics.
    binned = pd.cut(test, kept[-1], include_lowest=True)
    binned = binned.cat.add_categories([kwargs["fill_value"]])
    binned = binned.fillna(kwargs["fill_value"])
    df_input["test"] = binned.cat.remove_unused_categories()
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_output = _Basic.add_ks_to_df(df_input)
    print(df_output)
    print(banner.format(kwargs["cut_method"]))
def get_sample():
    """Demo: bin 1000 random values against a synthetic response and print the table.

    The response ``"code"`` is 1 for the first 300 rows and 0 elsewhere;
    the feature ``"test"`` is uniform random, rounded to 3 decimals, with
    rows 270-349 set to NaN. Candidate cuts longer than ``uncheck_len`` are
    accepted while their temporary WOE is monotonic; the scan stops at the
    first non-monotonic candidate. The last accepted cut is used for the
    printed table.
    """
    # Build the labels before creating the frame: the chained form
    # ``df_["code"][:300] = 1`` writes through an intermediate object and
    # does not update the frame under pandas copy-on-write.
    labels = np.zeros(1000)
    labels[:300] = 1
    df_ = pd.DataFrame({"code": labels})

    test = pd.Series(np.random.rand(1000), name="test")
    # ``np.nan`` is the spelling available in every NumPy version
    # (``np.NaN`` was removed in NumPy 2.0).
    test.iloc[270:350] = np.nan
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"],
                                    kwargs["bad"])
    test = test.round(3)

    array = []
    part_, test = _CutMethods.cut_method_flow(test, kwargs)
    # Candidates at or below this length skip the monotonicity check.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4
    for group in part_:
        if len(group) <= uncheck_len:
            array.append(group)
        else:
            try:
                tmp_woe = __get_tmp_woe(df_input, test, group, "code")
                if __check_monotonic(tmp_woe):
                    array.append(group)
                else:
                    # Stop at the first candidate that is not monotonic.
                    break
            except KeyError as error:
                logging.error(error)

    # Re-cut with the last accepted candidate and rebuild the statistics.
    out = pd.cut(test, array[-1], include_lowest=True)
    out = out.cat.add_categories([kwargs["fill_value"]]).fillna(
        kwargs["fill_value"])
    out = out.cat.remove_unused_categories()
    df_input["test"] = out
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_input = _Basic.add_ks_to_df(df_input)
    print(df_input)
def woe_bin(self, single_series, y):
    """Merge the categories of ``single_series`` into groups by binning their WOE.

    A per-category table is built first. Candidate cut points over the
    ``WOE`` column come from ``CutMethods.quantile_cut_flow`` and
    ``CutMethods.cumsum_cut_flow``. Each candidate whose smallest group
    holds at least ``self.group_min_percent`` of the total is scored
    according to ``self.choice_by`` and the best-scoring candidate (score
    must exceed 0) defines the final groups.

    :param single_series: categorical feature values.
    :param y: response series; ``y.name`` is used as the response column.
    :return: ``(False, False)`` when no candidate qualifies; otherwise
        ``(groups, df)`` where ``groups`` maps a bin code to the list of
        categories in it, e.g. ``{1: ["ZJ", "SH"], 2: ["XJ", "BJ"]}``, and
        ``df`` is the per-group table with an extra ``var_scope`` column
        holding the ``'|'``-joined categories.
    :raises ValueError: if ``self.choice_by`` is not one of
        ``"iv"``, ``"woe"``, ``"ks"``, ``"len"`` (case-insensitive).
    """
    df_input = _Basic.basic_prepare(y, bad=self.bad, good=self.good)
    df_input[single_series.name] = single_series
    df_input = _Basic.get_pivot_table(df_input, response=y.name,
                                      column=single_series.name)
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    if self.woe_inf_fill == 'avg':
        # TODO: WOE values listed in ``fill_items`` are currently replaced
        # by the mean of the remaining WOE values.
        avg = df_input["WOE"].replace(self.fill_items, np.nan).mean()
        df_input["WOE"] = df_input["WOE"].replace(self.fill_items, avg)
    df_input = _Basic.add_ks_to_df(df_input)

    last_cut_points = []
    choice_item_value = 0
    parts1 = CutMethods.quantile_cut_flow(df_input['WOE'],
                                          max_cut_part=self.max_cut_part,
                                          min_cut_part=self.min_cut_part)
    parts2 = CutMethods.cumsum_cut_flow(df_input['WOE'],
                                        add_min_group=True,
                                        max_cut_part=self.max_cut_part,
                                        min_cut_part=self.min_cut_part)
    # Keep the best-scoring candidate; a future option could prefer the
    # candidate with the most groups.
    for part in parts1 + parts2:
        # Drop +/-inf and NaN cut points. A set difference against
        # ``np.nan`` only removes that exact object, so computed NaNs
        # would slip through; test the values instead.
        part = sorted({point for point in part if np.isfinite(point)})
        if len(part) <= 2:
            continue
        bins_ = pd.cut(df_input['WOE'], part, include_lowest=True)
        df_input['bins'] = bins_.cat.codes
        tmp_df = pd.pivot_table(df_input,
                                values=['Bad_count', 'Good_count'],
                                index='bins',
                                aggfunc=np.sum)
        tmp_df = _Basic.add_basic_info_to_df(tmp_df)
        tmp_df['total_percent'] = tmp_df['total'].div(tmp_df['total'].sum())
        # Reject candidates whose smallest group is too small.
        if tmp_df['total_percent'].min() < self.group_min_percent:
            continue
        tmp_df = _Basic.add_woe_iv_to_df(tmp_df)
        tmp_df = _Basic.add_ks_to_df(tmp_df)

        choice = self.choice_by.lower()
        if choice == "iv":
            item_value = tmp_df['IV'].sum()
        elif choice == 'woe':
            item_value = tmp_df['WOE'].sum()
        elif choice == 'ks':
            item_value = tmp_df['KS'].max()
        elif choice == 'len':
            item_value = tmp_df.shape[0]
        else:
            raise ValueError('"choice_by" support ["iv","woe","ks","len"],'
                             'But get "{0}"'.format(self.choice_by))
        if item_value > choice_item_value:
            # Remember the score as well as the cut points; without this
            # the last candidate above 0 would win instead of the best.
            choice_item_value = item_value
            last_cut_points = part

    if not last_cut_points:
        return False, False

    df_input['bins'] = pd.cut(df_input['WOE'], last_cut_points,
                              include_lowest=True).cat.codes
    # Bin code -> categories in that bin, e.g.
    # {1: ["ZJ", "SH"], 2: ["XJ", "BJ"]}.
    groups = {}
    for i in df_input['bins'].unique():
        groups[i] = df_input[df_input['bins'] == i].index.tolist()

    # Rebuild the statistics with each category replaced by its bin code.
    df_ = _Basic.basic_prepare(y, bad=self.bad, good=self.good)
    df_[single_series.name] = single_series
    for code, items in groups.items():
        df_[single_series.name] = df_[single_series.name].replace(
            items, code)
    df_ = _Basic.get_pivot_table(df_, y.name, single_series.name)
    df_ = _Basic.add_basic_info_to_df(df_)
    df_ = _Basic.add_woe_iv_to_df(df_)
    df_ = _Basic.add_ks_to_df(df_)
    bins_ = pd.Series(['|'.join(i) for i in groups.values()],
                      index=list(groups.keys()),
                      name='var_scope')
    df_ = pd.concat([df_, bins_], axis=1)
    return groups, df_