Ejemplo n.º 1
0
    def __bestks_bin(self, single_series, y):
        """Bin ``single_series`` with the candidate cut whose total IV is largest.

        Candidate cut-point groups come from ``CutMethods.cut_method_flow``.
        A group with fewer than two points is accepted as-is, a group of at
        most ``uncheck_len`` points only has to pass
        ``_Basic.check_proportion``, and a longer group must additionally pass
        ``_Basic.check_monotonic``.  Every accepted group is then used to
        re-cut the series, and the group whose ``"IV"`` column sums highest is
        kept.

        Args:
            single_series: values to bin; passed to ``pd.cut`` and its
                ``name`` is used as a column name.
            y: response series; ``y.name`` is used as the response column.

        Returns:
            ``(cut_points, stats_frame)`` for the best candidate, with the
            frame sorted by index, or ``False`` when no candidate passes the
            checks or no candidate has a total IV greater than zero.

        Side effects:
            Overwrites ``self.kwargs['df_']`` and ``self.kwargs['response']``.
        """
        df_input = _Basic.basic_prepare(y, self.good, self.bad)
        self.kwargs['df_'] = pd.DataFrame(y)
        self.kwargs['response'] = y.name
        part_, single_series, special = CutMethods.cut_method_flow(
            single_series, **self.kwargs)
        # Strict monotonicity lowers the length up to which a group is
        # accepted without the monotonicity check.
        uncheck_len = 2 if self.strict_monotonicity else 3
        arrays = []
        for group in part_:
            if len(group) < 2:  # used when a special value exists
                arrays.append(group)
            elif len(group) <= uncheck_len:
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    arrays.append(group)
            else:
                # Only monotonicity is checked here; a check on the number of
                # groups (e.g. a parameter requiring each group to hold at
                # least 0.05) could be added.
                try:
                    if _Basic.check_proportion(single_series, group,
                                               self.kwargs):
                        tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                     group, y.name,
                                                     self.kwargs)
                        if _Basic.check_monotonic(tmp_woe, self.kwargs):
                            arrays.append(group)
                except KeyError as error:
                    logging.error(error)

        if not arrays:
            return False

        ivs = 0
        df_last = None
        cut_last = []
        for group in arrays:
            # Work on a copy so the candidate lists returned by
            # cut_method_flow are not modified in place.
            array = list(group)
            if special[0]:  # add the special point to the cut points
                if special[1] == 0.01:
                    array[0] = special[0] + special[1]
                array.append(special[0])
                array.sort()
            df_input_tmp = copy.deepcopy(df_input)
            # Re-cut the series using only the cut points that passed the
            # checks above.
            out = pd.cut(single_series, array, include_lowest=True)
            out = out.cat.add_categories([self.fill_value
                                          ]).fillna(self.fill_value)
            out = out.cat.remove_unused_categories()
            df_input_tmp[single_series.name] = out
            df_input_tmp = _Basic.get_pivot_table(df_input_tmp, y.name,
                                                  single_series.name)
            df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
            df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
            # Everything above repeats the earlier per-candidate steps.
            df_output = _Basic.add_ks_to_df(df_input_tmp)
            iv_sum = df_output["IV"].sum()
            if ivs < iv_sum:
                cut_last = array
                ivs = iv_sum
                df_last = copy.deepcopy(df_output)
        if df_last is None:
            # No candidate beat the initial IV of 0 (e.g. every sum was 0 or
            # NaN); report failure the same way as "no candidates" instead of
            # failing on None.sort_index().
            return False
        df_last.sort_index(inplace=True)
        return cut_last, df_last
Ejemplo n.º 2
0
def get_sample_bestks(df_, test):
    """Run the "bestks" cut flow on ``test`` and print the best binning table.

    Relies on a module-level ``kwargs`` mapping defined outside this block;
    its ``"cut_method"`` entry is set to ``"bestks"`` as a side effect.
    Candidate cut groups that pass the proportion check (and, for groups
    longer than ``uncheck_len``, the monotonicity check) are each used to
    re-cut ``test``; the table whose ``"IV"`` column sums highest is printed.

    Args:
        df_: frame providing the ``"code"`` response column.
        test: values to bin; passed to ``_CutMethods.cut_method_flow`` and
            ``pd.cut``.

    Returns:
        None. Output is printed; ``None`` is printed as the table when no
        candidate has a total IV greater than zero.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    banner = "*****************************{0}**************************************"
    kwargs["cut_method"] = "bestks"
    print(banner.format(kwargs["cut_method"]))
    part_, test = _CutMethods.cut_method_flow(test,
                                              kwargs,
                                              response="code",
                                              df_=df_)
    # part_ holds the combinations of cut points; the IV of each surviving
    # combination is computed further below.
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    # Strict monotonicity lowers the length up to which a group is accepted
    # without the monotonicity check.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4
    arrays = []
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                arrays.append(group)
        else:
            # Only monotonicity is checked here; a check on the number of
            # groups (e.g. a parameter requiring each group to hold at least
            # 0.05) could be added.
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code",
                                                 kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)
    # From the groups meeting the monotonicity and distribution requirements,
    # pick the one with the largest total IV.
    ivs = 0
    df_last = None
    for array in arrays:
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut using only the cut points that passed the checks above.
        out = pd.cut(test, array, include_lowest=True)
        out = out.cat.add_categories([kwargs["fill_value"]
                                      ]).fillna(kwargs["fill_value"])
        out = out.cat.remove_unused_categories()
        df_input_tmp["test"] = out
        df_input_tmp = _Basic.get_pivot_table(df_input_tmp, "code", "test")
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        # Everything above repeats the earlier per-candidate steps.
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        if ivs < iv_sum:
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)
    print(df_last)
    print(banner.format(kwargs["cut_method"]))
Ejemplo n.º 3
0
 def __general_bin(self, single_series, y):
     """Bin ``single_series`` using the last candidate cut that passes the checks.

     Candidate cut-point groups come from ``CutMethods.cut_method_flow``.
     A group with fewer than two points is accepted as-is, a group of at
     most ``uncheck_len`` points only has to pass
     ``_Basic.check_proportion``, and a longer group must additionally pass
     ``_Basic.check_monotonic``.  The last accepted group is used to cut
     the series.

     Args:
         single_series: values to bin; passed to ``pd.cut`` and its ``name``
             is used as a column name.
         y: response series; ``y.name`` is used as the response column.

     Returns:
         ``(cut_points, stats_frame)`` with the frame sorted by index, or
         ``False`` when no candidate group is accepted.
     """
     prepared = _Basic.basic_prepare(y, self.good, self.bad)
     candidates, single_series, special = CutMethods.cut_method_flow(
         single_series, **self.kwargs)
     # Strict monotonicity lowers the length up to which a group is
     # accepted without the monotonicity check.
     if self.strict_monotonicity:
         uncheck_len = 2
     else:
         uncheck_len = 3

     accepted = []
     for group in candidates:
         if len(group) < 2:
             # Used when a special value exists.
             accepted.append(group)
             continue
         if len(group) <= uncheck_len:
             if _Basic.check_proportion(single_series, group, self.kwargs):
                 accepted.append(group)
             continue
         # Only monotonicity is checked here; a check on the number of
         # groups (e.g. a parameter requiring each group to hold at least
         # 0.05) could be added.
         try:
             if not _Basic.check_proportion(single_series, group,
                                            self.kwargs):
                 continue
             tmp_woe = _Basic.get_tmp_woe(prepared, single_series, group,
                                          y.name, self.kwargs)
             if _Basic.check_monotonic(tmp_woe, self.kwargs):
                 accepted.append(group)
         except KeyError as error:
             logging.error(error)

     if not accepted:
         return False

     cut_points = accepted[-1]
     if special[0]:
         # Add the special point to the cut points.
         if special[1] == 0.01:
             cut_points[0] = special[0] + special[1]
         cut_points.append(special[0])
         cut_points.sort()

     # Re-cut the series using only the cut points that passed the checks.
     binned = pd.cut(single_series, cut_points, include_lowest=True)
     binned = binned.cat.add_categories([self.fill_value])
     binned = binned.fillna(self.fill_value)
     binned = binned.cat.remove_unused_categories()
     prepared[single_series.name] = binned

     # Same table-building steps as used for the candidate check.
     table = _Basic.get_pivot_table(prepared, y.name, single_series.name)
     table = _Basic.add_woe_iv_to_df(_Basic.add_basic_info_to_df(table))
     df_output = _Basic.add_ks_to_df(table)
     df_output.sort_index(inplace=True)
     return cut_points, df_output
Ejemplo n.º 4
0
def get_sample(df_, test):
    """Run the "quantile" cut flow on ``test`` and print the binning table.

    Relies on a module-level ``kwargs`` mapping defined outside this block;
    its ``"cut_method"`` entry is set to ``"quantile"`` as a side effect.
    Candidate cut groups that pass the proportion check (and, for groups
    longer than ``uncheck_len``, the monotonicity check) are collected, and
    the last accepted group is used to cut ``test``.

    Args:
        df_: frame providing the ``"code"`` response column.
        test: values to bin; passed to ``_CutMethods.cut_method_flow`` and
            ``pd.cut``.

    Returns:
        None. Output is printed. When no candidate group is accepted an
        error is logged and no table is printed.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    banner = "*****************************{0}**************************************"
    kwargs["cut_method"] = "quantile"
    print(banner.format(kwargs["cut_method"]))
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    array = []
    part_, test = _CutMethods.cut_method_flow(test, kwargs)
    # part_: the values at each quantile; test: the original data.
    # Strict monotonicity lowers the length up to which a group is accepted
    # without the monotonicity check.
    uncheck_len = 3 if kwargs["strict_monotonicity"] else 4

    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                array.append(group)
        else:
            # Only monotonicity is checked here; a check on the number of
            # groups (e.g. a parameter requiring each group to hold at least
            # 0.05) could be added.
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code",
                                                 kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        array.append(group)
            except KeyError as error:
                logging.error(error)
    print(array)
    if not array:
        # array[-1] below would raise IndexError; report the condition
        # explicitly instead.
        logging.error("get_sample: no cut group passed the checks")
        print(banner.format(kwargs["cut_method"]))
        return
    # Re-cut using only the last group that passed the checks above.
    out = pd.cut(test, array[-1], include_lowest=True)
    out = out.cat.add_categories([kwargs["fill_value"]
                                  ]).fillna(kwargs["fill_value"])
    out = out.cat.remove_unused_categories()
    df_input["test"] = out
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    # Everything above repeats the earlier per-candidate steps.
    df_output = _Basic.add_ks_to_df(df_input)
    print(df_output)
    print(banner.format(kwargs["cut_method"]))