Beispiel #1
0
    def __bestks_bin(self, single_series, y):
        """Choose, among the best-KS cut candidates, the one with the highest IV.

        Candidate cut-point lists come from ``CutMethods.cut_method_flow``.
        A candidate is kept when it has fewer than two points, or when it
        passes ``_Basic.check_proportion`` and, if it is longer than
        ``uncheck_len``, also ``_Basic.check_monotonic`` on its WOE values.
        Every kept candidate is re-cut and scored by the sum of its ``"IV"``
        column; the highest-scoring one wins.

        Side effect: stores ``pd.DataFrame(y)`` and ``y.name`` in
        ``self.kwargs`` under ``'df_'`` and ``'response'``.

        :param single_series: feature values to bin; ``single_series.name`` is
            used as the column name.
        :param y: response series; ``y.name`` is used as the response column.
        :return: ``(cut_points, df)`` for the winning candidate, or ``False``
            when no candidate passes the checks or none reaches an IV sum
            above zero.
        """
        df_input = _Basic.basic_prepare(y, self.good, self.bad)
        self.kwargs['df_'] = pd.DataFrame(y)
        self.kwargs['response'] = y.name
        part_, single_series, special = CutMethods.cut_method_flow(
            single_series, **self.kwargs)
        # Under strict monotonicity, shorter candidates already get the
        # monotonicity check.
        uncheck_len = 2 if self.strict_monotonicity else 3
        arrays = []
        for group in part_:
            if len(group) < 2:  # used when a special value exists
                arrays.append(group)
            elif len(group) <= uncheck_len:
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    arrays.append(group)
            else:
                # Only monotonicity is checked here; a check on the number of
                # groups (e.g. a parameter requiring each group to hold at
                # least 0.05) could be added.
                try:
                    if _Basic.check_proportion(single_series, group,
                                               self.kwargs):
                        tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                     group, y.name,
                                                     self.kwargs)
                        if _Basic.check_monotonic(tmp_woe, self.kwargs):
                            arrays.append(group)
                except KeyError as error:
                    logging.error(error)

        if not arrays:
            return False

        ivs = 0
        df_last = None
        cut_last = []
        for array in arrays:
            if special[0]:  # add the special point to the cut points
                # Copy first so the candidate list is not modified in place.
                array = list(array)
                if special[1] == 0.01:
                    array[0] = special[0] + special[1]
                array.append(special[0])
                array.sort()
            df_input_tmp = copy.deepcopy(df_input)
            # Re-cut the series using a candidate that passed the checks.
            out = pd.cut(single_series, array, include_lowest=True)
            out = out.cat.add_categories([self.fill_value
                                          ]).fillna(self.fill_value)
            out = out.cat.remove_unused_categories()
            df_input_tmp[single_series.name] = out
            df_input_tmp = _Basic.get_pivot_table(df_input_tmp, y.name,
                                                  single_series.name)
            df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
            df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
            df_output = _Basic.add_ks_to_df(df_input_tmp)
            iv_sum = df_output["IV"].sum()
            if ivs < iv_sum:
                cut_last = array
                ivs = iv_sum
                df_last = copy.deepcopy(df_output)

        # No candidate scored above zero: there is no table to sort or return.
        if df_last is None:
            return False
        df_last.sort_index(inplace=True)
        return cut_last, df_last
Beispiel #2
0
def __get_tmp_woe(df_n, series_, group, response):
    """Return the ``"WOE"`` values of the non-fill bins for one candidate cut.

    ``series_`` is cut at the edges in ``group`` and written into ``df_n``
    under ``series_.name``; missing bin labels are replaced with the
    module-level ``kwargs["fill_value"]`` category. The pivot table built
    from ``df_n`` is enriched with basic info and WOE/IV, rows containing NaN
    are dropped, and the row indexed by the fill value is excluded.

    Before returning, ``df_n[series_.name]`` is set to the raw ``series_``.

    :param df_n: working frame; its ``series_.name`` column is overwritten.
    :param series_: values to cut.
    :param group: bin edges passed to ``pd.cut``.
    :param response: response column name passed to the pivot helper.
    :return: the ``"WOE"`` column of the remaining rows.
    """
    column = series_.name
    df_n[column] = pd.cut(series_, group, include_lowest=True)

    fill = kwargs["fill_value"]
    # Read the column back from the frame so the cleanup runs on the
    # frame-aligned values, then drop categories no row uses.
    filled = df_n[column].cat.add_categories([fill]).fillna(fill)
    df_n[column] = filled.cat.remove_unused_categories()

    table = _Basic.get_pivot_table(df_n, response, column)
    table = _Basic.add_woe_iv_to_df(_Basic.add_basic_info_to_df(table))
    table = table.dropna()

    not_fill = table.index != fill
    woe = table[not_fill]["WOE"]

    df_n[column] = series_
    return woe
Beispiel #3
0
 def single_bin(self, single_series, y):
     """Bin a column by using each of its distinct values as its own group.

     Per the original note, this is meant for columns with at most 12
     categories, where the values are grouped directly.

     :param single_series: column to group; its ``name`` is the column name.
     :param y: response series; its ``name`` is the response column.
     :return: ``('single', df)`` where ``df`` is the index-sorted pivot table
         after the basic-info, WOE/IV and KS helpers have been applied.
     """
     column = single_series.name
     table = _Basic.basic_prepare(y, self.good, self.bad)
     table[column] = single_series
     table = _Basic.get_pivot_table(table, y.name, column)
     for enrich in (_Basic.add_basic_info_to_df,
                    _Basic.add_woe_iv_to_df,
                    _Basic.add_ks_to_df):
         table = enrich(table)
     return 'single', table.sort_index()
Beispiel #4
0
def get_sample_bestks(df_, test):
    """Demo: run the ``"bestks"`` cut on ``test`` and print the best table.

    Sets the module-level ``kwargs["cut_method"]`` to ``"bestks"``, filters
    the candidate cuts returned by ``_CutMethods.cut_method_flow`` with the
    proportion and monotonicity checks, re-cuts ``test`` with each surviving
    candidate, and prints the table whose ``"IV"`` column has the largest sum
    (``None`` is printed when no candidate scores above zero).

    Output uses single-argument ``print(...)`` calls, which behave the same
    on Python 2 and are valid on Python 3.

    :param df_: frame holding the response in its ``"code"`` column.
    :param test: series of feature values to bin.
    :return: ``None``; results are printed.
    """
    kwargs["cut_method"] = "bestks"
    banner = "*****************************{0}**************************************"
    print(banner.format(kwargs["cut_method"]))
    part_, test = _CutMethods.cut_method_flow(test,
                                              kwargs,
                                              response="code",
                                              df_=df_)
    # part_ holds the candidate cut combinations; IV is computed for each
    # surviving candidate further down.
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    uncheck_len = 4
    if kwargs["strict_monotonicity"]:  # strict monotonicity
        uncheck_len = 3
    arrays = []
    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                arrays.append(group)
        else:
            # Only monotonicity is checked here; a check on the number of
            # groups (e.g. each group at least 0.05) could be added.
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code",
                                                 kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        arrays.append(group)
            except KeyError as error:
                logging.error(error)
    # Among the candidates that meet the monotonicity and distribution
    # requirements, keep the one with the largest IV.
    ivs = 0
    df_last = None
    for array in arrays:
        df_input_tmp = copy.deepcopy(df_input)
        # Re-cut using a candidate that passed the checks.
        out = pd.cut(test, array, include_lowest=True)
        out = out.cat.add_categories([kwargs["fill_value"]
                                      ]).fillna(kwargs["fill_value"])
        out = out.cat.remove_unused_categories()
        df_input_tmp["test"] = out
        df_input_tmp = _Basic.get_pivot_table(df_input_tmp, "code", "test")
        df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
        df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
        df_output = _Basic.add_ks_to_df(df_input_tmp)
        iv_sum = df_output["IV"].sum()
        if ivs < iv_sum:
            ivs = iv_sum
            df_last = copy.deepcopy(df_output)
    print(df_last)
    print(banner.format(kwargs["cut_method"]))
Beispiel #5
0
 def __general_bin(self, single_series, y):
     """Bin ``single_series`` with the last candidate cut that passes the checks.

     Candidates come from ``CutMethods.cut_method_flow``. One with fewer than
     two points is always kept; one with up to ``uncheck_len`` points must
     pass ``_Basic.check_proportion``; longer ones must additionally pass
     ``_Basic.check_monotonic`` on their WOE values. The last kept candidate
     is used for the final cut.

     :param single_series: feature values to bin; its ``name`` is the column
         name.
     :param y: response series; its ``name`` is the response column.
     :return: ``(cut_points, df)`` with the index-sorted result table, or
         ``False`` when no candidate was kept.
     """
     candidates = []
     df_input = _Basic.basic_prepare(y, self.good, self.bad)
     part_, single_series, special = CutMethods.cut_method_flow(
         single_series, **self.kwargs)

     # Strict monotonicity lowers the length up to which the monotonicity
     # check is skipped.
     if self.strict_monotonicity:
         uncheck_len = 2
     else:
         uncheck_len = 3

     for group in part_:
         size = len(group)
         if size < 2:  # used when a special value exists
             candidates.append(group)
             continue
         if size <= uncheck_len:
             if _Basic.check_proportion(single_series, group, self.kwargs):
                 candidates.append(group)
             continue
         # Only monotonicity is checked below; a check on the number of
         # groups (e.g. each group at least 0.05) could be added.
         try:
             if not _Basic.check_proportion(single_series, group,
                                            self.kwargs):
                 continue
             tmp_woe = _Basic.get_tmp_woe(df_input, single_series, group,
                                          y.name, self.kwargs)
             if _Basic.check_monotonic(tmp_woe, self.kwargs):
                 candidates.append(group)
         except KeyError as error:
             logging.error(error)

     if not candidates:
         return False

     cut_points = candidates[-1]
     if special[0]:  # add the special point to the cut points
         if special[1] == 0.01:
             cut_points[0] = special[0] + special[1]
         cut_points.append(special[0])
         cut_points.sort()

     # Re-cut the series with the chosen cut points.
     binned = pd.cut(single_series, cut_points, include_lowest=True)
     binned = binned.cat.add_categories([self.fill_value])
     binned = binned.fillna(self.fill_value).cat.remove_unused_categories()
     df_input[single_series.name] = binned

     table = _Basic.get_pivot_table(df_input, y.name, single_series.name)
     for enrich in (_Basic.add_basic_info_to_df,
                    _Basic.add_woe_iv_to_df,
                    _Basic.add_ks_to_df):
         table = enrich(table)
     table.sort_index(inplace=True)
     return cut_points, table
Beispiel #6
0
def get_sample(df_, test):
    """Demo: run the ``"quantile"`` cut on ``test`` and print the result table.

    Sets the module-level ``kwargs["cut_method"]`` to ``"quantile"``, filters
    the candidate cuts returned by ``_CutMethods.cut_method_flow`` with the
    proportion and monotonicity checks, then re-cuts ``test`` with the last
    surviving candidate and prints the resulting table.

    If no candidate survives, an error is logged and the function returns
    early instead of failing on ``array[-1]``.

    Output uses single-argument ``print(...)`` calls, which behave the same
    on Python 2 and are valid on Python 3.

    :param df_: frame holding the response in its ``"code"`` column.
    :param test: series of feature values to bin.
    :return: ``None``; results are printed.
    """
    kwargs["cut_method"] = "quantile"
    banner = "*****************************{0}**************************************"
    print(banner.format(kwargs["cut_method"]))
    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    array = []
    # part_: the candidate quantile cut points; test: the original data.
    part_, test = _CutMethods.cut_method_flow(test, kwargs)
    uncheck_len = 4
    if kwargs["strict_monotonicity"]:  # strict monotonicity
        uncheck_len = 3

    for group in part_:
        if len(group) <= uncheck_len:
            if _Basic.check_proportion(test, group, kwargs):
                array.append(group)
        else:
            # Only monotonicity is checked here; a check on the number of
            # groups (e.g. each group at least 0.05) could be added.
            try:
                if _Basic.check_proportion(test, group, kwargs):
                    tmp_woe = _Basic.get_tmp_woe(df_input, test, group, "code",
                                                 kwargs)
                    if _Basic.check_monotonic(tmp_woe, kwargs):
                        array.append(group)
            except KeyError as error:
                logging.error(error)
    print(array)
    if not array:
        # Nothing passed the checks, so there is no cut to apply.
        logging.error("no cut candidate passed the proportion/monotonicity "
                      "checks")
        return
    # Re-cut using the last candidate that passed the checks.
    out = pd.cut(test, array[-1], include_lowest=True)
    out = out.cat.add_categories([kwargs["fill_value"]
                                  ]).fillna(kwargs["fill_value"])
    out = out.cat.remove_unused_categories()
    df_input["test"] = out
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_output = _Basic.add_ks_to_df(df_input)
    print(df_output)
    print(banner.format(kwargs["cut_method"]))
Beispiel #7
0
def get_sample():
    """Demo: bin 1000 random values against a synthetic response and print it.

    Builds a ``"code"`` response whose first 300 rows are 1 and the rest 0,
    and a ``"test"`` series of uniform random numbers rounded to 3 decimals
    with positions 270-349 set to NaN. Candidate cuts from
    ``_CutMethods.cut_method_flow`` are accepted in order until the first one
    that fails the monotonicity check; the last accepted candidate is used
    for the final cut, and the resulting table is printed.

    If no candidate is accepted, an error is logged and the function returns
    early instead of failing on ``array[-1]``.

    :return: ``None``; the result table is printed.
    """
    # Fill the response array before storing it, rather than assigning
    # through a chained ``df_["code"][:300]`` selection.
    code = np.zeros(1000)
    code[:300] = 1
    df_ = pd.DataFrame()
    df_["code"] = code

    test = pd.Series(np.random.rand(1000), name="test")
    # ``np.nan`` is the spelling available in every NumPy version.
    test.iloc[270:350] = np.nan

    df_input = _Basic.basic_prepare(df_["code"], kwargs["good"], kwargs["bad"])
    test = test.round(3)

    array = []
    part_, test = _CutMethods.cut_method_flow(test, kwargs)

    uncheck_len = 4
    if kwargs["strict_monotonicity"]:
        uncheck_len = 3

    for group in part_:
        if len(group) <= uncheck_len:
            array.append(group)
        else:
            try:
                tmp_woe = __get_tmp_woe(df_input, test, group, "code")
                if __check_monotonic(tmp_woe):
                    array.append(group)
                else:
                    # Stop at the first candidate that is not monotonic.
                    break
            except KeyError as error:
                logging.error(error)

    if not array:
        # Nothing was accepted, so there is no cut to apply.
        logging.error("no cut candidate passed the monotonicity check")
        return

    out = pd.cut(test, array[-1], include_lowest=True)
    out = out.cat.add_categories([kwargs["fill_value"]
                                  ]).fillna(kwargs["fill_value"])
    out = out.cat.remove_unused_categories()
    df_input["test"] = out
    df_input = _Basic.get_pivot_table(df_input, "code", "test")
    df_input = _Basic.add_basic_info_to_df(df_input)
    df_input = _Basic.add_woe_iv_to_df(df_input)
    df_input = _Basic.add_ks_to_df(df_input)
    print(df_input)
Beispiel #8
0
    def woe_bin(self, single_series, y):
        """Merge the values of a column into groups by cutting on their WOE.

        A per-value table (counts, WOE/IV, KS) is built first. Candidate cut
        points over the ``"WOE"`` column come from
        ``CutMethods.quantile_cut_flow`` and ``CutMethods.cumsum_cut_flow``.
        Each candidate is reduced to its sorted, distinct, finite points and
        skipped when two or fewer remain, or when its smallest group holds
        less than ``self.group_min_percent`` of the total. Remaining
        candidates are scored according to ``self.choice_by`` and the
        highest-scoring one (above zero) is used to assign every value to a
        bin code.

        :param single_series: column whose values are grouped; its ``name``
            is the column name.
        :param y: response series; its ``name`` is the response column.
        :return: ``(groups, df)`` where ``groups`` maps a bin code to the list
            of original values in that bin (e.g.
            ``{1: ["ZJ", "SH"], 2: ["XJ", "BJ"]}``) and ``df`` is the per-bin
            table with an added ``var_scope`` column; ``(False, False)`` when
            no candidate qualifies.
        :raises ValueError: if ``self.choice_by`` is not one of
            ``"iv"``, ``"woe"``, ``"ks"``, ``"len"`` (case-insensitive).
        """
        df_input = _Basic.basic_prepare(y, bad=self.bad, good=self.good)
        df_input[single_series.name] = single_series
        df_input = _Basic.get_pivot_table(df_input,
                                          response=y.name,
                                          column=single_series.name)
        df_input = _Basic.add_basic_info_to_df(df_input)
        df_input = _Basic.add_woe_iv_to_df(df_input)
        if self.woe_inf_fill == 'avg':
            # TODO: WOE values listed in self.fill_items are currently
            # replaced by the mean of the remaining WOE values.
            avg = df_input["WOE"].replace(self.fill_items, np.nan).mean()
            df_input["WOE"] = df_input["WOE"].replace(self.fill_items, avg)
        df_input = _Basic.add_ks_to_df(df_input)

        last_cut_points = []
        choice_item_value = 0
        parts1 = CutMethods.quantile_cut_flow(df_input['WOE'],
                                              max_cut_part=self.max_cut_part,
                                              min_cut_part=self.min_cut_part)
        parts2 = CutMethods.cumsum_cut_flow(df_input['WOE'],
                                            add_min_group=True,
                                            max_cut_part=self.max_cut_part,
                                            min_cut_part=self.min_cut_part)
        parts = parts1 + parts2
        # The best-scoring candidate is kept; choosing by the largest number
        # of groups is available through choice_by="len".
        for part in parts:
            # Drop +/-inf and NaN by value: subtracting {np.nan} from a set
            # only removes that exact object, not other NaN instances.
            part = sorted({point for point in part if np.isfinite(point)})
            if len(part) <= 2:
                continue
            bins_ = pd.cut(df_input['WOE'], part, include_lowest=True)
            df_input['bins'] = bins_.cat.codes
            tmp_df = pd.pivot_table(df_input,
                                    values=['Bad_count', 'Good_count'],
                                    index='bins',
                                    aggfunc=np.sum)
            tmp_df = _Basic.add_basic_info_to_df(tmp_df)
            tmp_df['total_percent'] = tmp_df['total'].div(
                tmp_df['total'].sum())
            # Enforce the minimum share every group must hold.
            if tmp_df['total_percent'].min() < self.group_min_percent:
                continue
            tmp_df = _Basic.add_woe_iv_to_df(tmp_df)
            tmp_df = _Basic.add_ks_to_df(tmp_df)
            choice_by = self.choice_by.lower()
            if choice_by == "iv":
                item_value = tmp_df['IV'].sum()
            elif choice_by == 'woe':
                item_value = tmp_df['WOE'].sum()
            elif choice_by == 'ks':
                item_value = tmp_df['KS'].max()
            elif choice_by == 'len':
                item_value = tmp_df.shape[0]
            else:
                raise ValueError('"choice_by" support ["iv","woe","ks","len"],'
                                 'But get "{0}"'.format(self.choice_by))
            if item_value > choice_item_value:
                # Remember the best score so far; without this update every
                # later candidate scoring above zero would replace the winner.
                choice_item_value = item_value
                last_cut_points = part
        if not last_cut_points:
            return False, False
        df_input['bins'] = pd.cut(df_input['WOE'],
                                  last_cut_points,
                                  include_lowest=True).cat.codes
        # Bin code -> list of original values in that bin,
        # e.g. {1: ["ZJ", "SH"], 2: ["XJ", "BJ"]}.
        groups = {}
        for i in df_input['bins'].unique():
            items_ = df_input[df_input['bins'] == i].index.tolist()
            groups[i] = items_

        df_ = _Basic.basic_prepare(y, bad=self.bad, good=self.good)
        df_[single_series.name] = single_series

        # Replace every original value with the code of its bin.
        for code, items in groups.items():
            df_[single_series.name] = df_[single_series.name].replace(
                items, code)
        df_ = _Basic.get_pivot_table(df_, y.name, single_series.name)
        df_ = _Basic.add_basic_info_to_df(df_)
        df_ = _Basic.add_woe_iv_to_df(df_)
        df_ = _Basic.add_ks_to_df(df_)
        bins_ = pd.Series(['|'.join(i) for i in groups.values()],
                          index=list(groups),
                          name='var_scope')
        df_ = pd.concat([df_, bins_], axis=1)
        return groups, df_  # ({1: ["ZJ", "SH"], 2: ["XJ", "BJ"]}, df)