def __bestks_bin(self, single_series, y):
    """Return the candidate cut-point list whose binning has the largest IV sum.

    Candidate groups come from ``CutMethods.cut_method_flow``. A group is
    kept when it has fewer than two points, or when it passes
    ``_Basic.check_proportion`` and (for groups longer than the unchecked
    length) ``_Basic.check_monotonic`` on its temporary WOE. Every kept
    candidate is re-cut with ``pd.cut`` and scored by the sum of the
    ``"IV"`` column of the resulting statistics frame.

    Side effects: writes ``'df_'`` (``y`` wrapped in a DataFrame) and
    ``'response'`` (``y.name``) into ``self.kwargs`` before the cut flow runs.

    :param single_series: feature values to bin; its ``name`` is used as the
        column name (expected to be a pandas Series).
    :param y: response values; ``y.name`` is used as the response column.
    :return: ``(cut_points, stats_frame)`` for the best candidate, or
        ``False`` when no candidate qualifies or none yields an IV sum
        greater than zero.
    """
    df_input = _Basic.basic_prepare(y, self.good, self.bad)
    self.kwargs['df_'] = pd.DataFrame(y)
    self.kwargs['response'] = y.name
    part_, single_series, special = CutMethods.cut_method_flow(
        single_series, **self.kwargs)
    # Groups no longer than this skip the monotonicity check; strict
    # monotonicity lowers the threshold.
    uncheck_len = 2 if self.strict_monotonicity else 3

    arrays = []
    for group in part_:
        if len(group) < 2:
            # Used for the case where a special value exists.
            arrays.append(group)
        elif len(group) <= uncheck_len:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                arrays.append(group)
        else:
            try:
                # Only monotonicity is checked here; the number of groups
                # could be checked too (e.g. a parameter requiring each
                # group to hold at least 0.05).
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    tmp_woe = _Basic.get_tmp_woe(
                        df_input, single_series, group, y.name, self.kwargs)
                    if _Basic.check_monotonic(tmp_woe, self.kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)

    if not arrays:
        return False

    ivs = 0
    df_last = None
    cut_last = []
    for array in arrays:
        # Work on a copy so the candidate lists returned by the cut flow are
        # not modified while the special point is inserted.
        cuts = list(array)
        # NOTE(review): a special value of 0 is falsy and is skipped here --
        # confirm that this is intended.
        if special[0]:
            # Add the special point to the cut points.
            if special[1] == 0.01:
                cuts[0] = special[0] + special[1]
            cuts.append(special[0])
            cuts.sort()
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut using only the candidate that passed the checks.
        out = pd.cut(single_series, cuts, include_lowest=True)
        out = out.cat.add_categories([self.fill_value]).fillna(self.fill_value)
        out = out.cat.remove_unused_categories()
        df_input_tmp[single_series.name] = out
        df_input_tmp = _Basic.get_pivot_table(
            df_input_tmp, y.name, single_series.name)
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        if ivs < iv_sum:
            cut_last = cuts
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)

    if df_last is None:
        # No candidate produced an IV sum above zero; without this guard the
        # sort below would raise AttributeError on None.
        return False
    df_last.sort_index(inplace=True)
    return cut_last, df_last
def get_sample_bestks(df_, test):
    """Print the highest-IV binning table for ``test`` using the "bestks" method.

    Sets ``kwargs["cut_method"]`` on the module-level ``kwargs`` dict to
    ``"bestks"``, obtains candidate cut-point groups from
    ``_CutMethods.cut_method_flow``, keeps those passing the ``_Basic``
    proportion / monotonicity checks, and prints the statistics frame of the
    candidate with the largest ``"IV"`` sum between two banner lines.
    ``None`` is printed when no candidate has an IV sum above zero.

    :param df_: frame containing the response column ``"code"``.
    :param test: feature values to bin.
    :return: None; the result is only printed.
    """
    kwargs["cut_method"] = "bestks"
    print("*****************************{0}**************************************".format(
        kwargs["cut_method"]))
    # part_ holds the candidate cut combinations; IV is computed for each below.
    part_, test = _CutMethods.cut_method_flow(test, kwargs, response="code", df_=df_)
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    # Groups no longer than this skip the monotonicity check; strict
    # monotonicity lowers the threshold.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4

    arrays = []
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                arrays.append(group)
        else:
            try:
                # Only monotonicity is checked here; the number of groups
                # could be checked too (e.g. a parameter requiring each
                # group to hold at least 0.05).
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code", kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)

    # From the candidates meeting the monotonicity and distribution
    # requirements, pick the one with the largest IV.
    ivs = 0
    df_last = None
    for array in arrays:
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut using only the candidate that passed the checks.
        out = pd.cut(test, array, include_lowest=True)
        out = out.cat.add_categories([kwargs["fill_value"]]).fillna(kwargs["fill_value"])
        out = out.cat.remove_unused_categories()
        df_input_tmp["test"] = out
        df_input_tmp = _Basic.get_pivot_table(df_input_tmp, "code", "test")
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        if ivs < iv_sum:
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df_last)
    print("*****************************{0}**************************************".format(
        kwargs["cut_method"]))
def __general_bin(self, single_series, y):
    """Bin ``single_series`` with the last candidate cut list that passes the checks.

    Candidate groups come from ``CutMethods.cut_method_flow``. A group is
    accepted when it has fewer than two points, or when it passes
    ``_Basic.check_proportion`` and (for groups longer than the unchecked
    length) ``_Basic.check_monotonic`` on its temporary WOE. The last
    accepted group is used as the cut points; the special point, when
    present, is inserted into that list in place.

    :param single_series: feature values to bin; its ``name`` is used as the
        column name (expected to be a pandas Series).
    :param y: response values; ``y.name`` is used as the response column.
    :return: ``(cut_points, stats_frame)`` with the frame sorted by index,
        or ``False`` when no group is accepted.
    """
    stats = _Basic.basic_prepare(y, self.good, self.bad)
    part_, single_series, special = CutMethods.cut_method_flow(
        single_series, **self.kwargs)
    # Groups no longer than this skip the monotonicity check; strict
    # monotonicity lowers the threshold.
    max_unchecked = 2 if self.strict_monotonicity else 3

    accepted = []
    for group in part_:
        if len(group) < 2:
            # Used for the case where a special value exists.
            accepted.append(group)
            continue
        if len(group) <= max_unchecked:
            if _Basic.check_proportion(single_series, group, self.kwargs):
                accepted.append(group)
            continue
        try:
            # Only monotonicity is checked here; the number of groups could
            # be checked too (e.g. a parameter requiring each group to hold
            # at least 0.05).
            if not _Basic.check_proportion(single_series, group, self.kwargs):
                continue
            tmp_woe = _Basic.get_tmp_woe(
                stats, single_series, group, y.name, self.kwargs)
            if _Basic.check_monotonic(tmp_woe, self.kwargs):
                accepted.append(group)
        except KeyError as error:
            logging.error(error)

    if not accepted:
        return False

    cut_points = accepted[-1]
    special_value = special[0]
    if special_value:
        # Add the special point to the cut points (the list is updated in place).
        if special[1] == 0.01:
            cut_points[0] = special_value + special[1]
        cut_points.append(special_value)
        cut_points.sort()

    # Re-cut using only the cut points that passed the checks.
    binned = pd.cut(single_series, cut_points, include_lowest=True)
    binned = binned.cat.add_categories([self.fill_value])
    binned = binned.fillna(self.fill_value)
    binned = binned.cat.remove_unused_categories()
    stats[single_series.name] = binned

    stats = _Basic.get_pivot_table(stats, y.name, single_series.name)
    stats = _Basic.add_basic_info_to_df(stats)
    stats = _Basic.add_woe_iv_to_df(stats)
    result = _Basic.add_ks_to_df(stats)
    result.sort_index(inplace=True)
    return cut_points, result
def get_sample(df_, test):
    """Print the binning table for ``test`` using the "quantile" cut method.

    Sets ``kwargs["cut_method"]`` on the module-level ``kwargs`` dict to
    ``"quantile"``, obtains candidate cut-point groups from
    ``_CutMethods.cut_method_flow``, keeps those passing the ``_Basic``
    proportion / monotonicity checks, and prints the statistics frame built
    from the last accepted group between two banner lines.

    :param df_: frame containing the response column ``"code"``.
    :param test: feature values to bin.
    :return: None; the result is only printed. When no group is accepted an
        error is logged and the function returns early.
    """
    kwargs["cut_method"] = "quantile"
    print("*****************************{0}**************************************".format(
        kwargs["cut_method"]))
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    # part_ holds the quantile values; test is the raw data.
    part_, test = _CutMethods.cut_method_flow(test, kwargs)
    # Groups no longer than this skip the monotonicity check; strict
    # monotonicity lowers the threshold.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4

    array = []
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                array.append(group)
        else:
            try:
                # Only monotonicity is checked here; the number of groups
                # could be checked too (e.g. a parameter requiring each
                # group to hold at least 0.05).
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code", kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        array.append(group)
            except KeyError as error:
                logging.error(error)

    print(array)
    if not array:
        # No group passed the checks; array[-1] below would raise IndexError.
        logging.error("no cut points passed the proportion/monotonicity checks")
        return

    # Re-cut using only the last group that passed the checks.
    out = pd.cut(test, array[-1], include_lowest=True)
    out = out.cat.add_categories([kwargs["fill_value"]]).fillna(kwargs["fill_value"])
    out = out.cat.remove_unused_categories()
    df_input["test"] = out
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_output = _Basic.add_ks_to_df(df_input)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df_output)
    print("*****************************{0}**************************************".format(
        kwargs["cut_method"]))