Example #1
0
 def _get_info_table(df_, column, response):
     """Build the per-bin info table (counts, cumulative shares, KS) for
     one variable.

     :param df_: input frame containing ``column`` and ``response``
     :param column: name of the binned variable column
     :param response: name of the response column
     :return: pivot table with ``cum_good_percent``/``cum_bad_percent``/``KS``
     """
     df_ = copy.deepcopy(df_)
     df_ = _Basic.get_pivot_table(df_, column=column, response=response)
     df_ = _Basic.add_basic_info_to_df(df_)
     # df_ = _Basic.add_woe_iv_to_df(df_)
     # Cumulative share of goods/bads per bin.  Dividing by the scalar
     # total replaces the original np.dot(total, np.ones(n)) round-trip,
     # which allocated a length-n array only to divide by a constant.
     df_["cum_good_percent"] = df_["Good_count"].cumsum().div(
         df_["Good_count"].sum())
     df_["cum_bad_percent"] = df_["Bad_count"].cumsum().div(
         df_["Bad_count"].sum())
     # KS statistic per bin = |cum good % - cum bad %|.
     df_["KS"] = df_["cum_good_percent"].sub(df_["cum_bad_percent"]).abs()
     return df_
Example #2
0
    def __bestks_bin(self, single_series, y):
        """Bin ``single_series`` by evaluating every candidate cut and
        keeping the one whose resulting table has the largest total IV.

        Candidates first pass a proportion check; longer candidates must
        also pass a WOE monotonicity check.

        :param single_series: feature column (pd.Series) to bin
        :param y: response series; ``y.name`` is used as the pivot response
        :return: ``(best_cut_points, stats_df)`` — or ``False`` when no
            candidate cut survives the checks.  Note ``([], None)`` is
            possible when all surviving cuts have IV <= 0.
        """
        df_input = _Basic.basic_prepare(y, self.good, self.bad)
        part_, single_series = _CutMethods.cut_method_flow(single_series,
                                                           self.kwargs,
                                                           response=y.name,
                                                           df_=pd.DataFrame(y))
        # Cuts with at most this many points skip the monotonicity check;
        # strict mode allows fewer unchecked points.
        uncheck_len = 3 if self.strict_monotonicity else 4  # strict monotonicity
        arrays = []
        for group in part_:
            if len(group) <= uncheck_len:
                if _Basic.check_proportion(single_series, group, self.kwargs):
                    arrays.append(group)
            else:
                try:  # only monotonicity is checked; could also enforce a min share per group (e.g. 0.05)
                    if _Basic.check_proportion(single_series, group,
                                               self.kwargs):
                        tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                     group, y.name,
                                                     self.kwargs)
                        if _Basic.check_monotonic(tmp_woe, self.kwargs):
                            arrays.append(group)
                except KeyError as error:
                    logging.error(error)

        ivs = 0
        df_last = None
        cut_last = []
        if not arrays:
            return False
        for array in arrays:
            df_input_tmp = copy.deepcopy(df_input)
            out = pd.cut(single_series,
                         array,
                         include_lowest=True,
                         right=False)  # re-cut with the splits that passed the checks
            out = out.cat.add_categories([self.fill_value
                                          ]).fillna(self.fill_value)
            out = out.cat.remove_unused_categories()
            df_input_tmp[single_series.name] = out
            df_input_tmp = _Basic.get_pivot_table(df_input_tmp, y.name,
                                                  single_series.name)
            df_input_tmp = _Basic.add_basic_info_to_df(df_input_tmp)
            df_input_tmp = _Basic.add_woe_iv_to_df(df_input_tmp)
            # the steps above repeat the earlier pivot/WOE pipeline
            df_output = _Basic.add_ks_to_df(df_input_tmp)
            iv_sum = df_output["IV"].sum()
            # print df_output
            # Keep the candidate with the highest total IV seen so far.
            if ivs < iv_sum:
                cut_last = array
                ivs = iv_sum
                df_last = copy.deepcopy(df_output)
        # print(df_last)
        return cut_last, df_last
Example #3
0
 def __general_bin(self, single_series, y):
     """Bin ``single_series`` keeping the LAST candidate cut that passes
     the proportion / monotonicity checks, then rebuild the stats table.

     :param single_series: feature column (pd.Series) to bin
     :param y: response series; ``y.name`` is used as the pivot response
     :return: ``(cut_points, stats_df)`` or ``False`` when no candidate
         cut survives the checks
     """
     array = []
     df_input = _Basic.basic_prepare(y, self.good, self.bad)
     part_, single_series = _CutMethods.cut_method_flow(
         single_series, self.kwargs)
     # Cuts with at most this many points skip the monotonicity check;
     # strict mode allows fewer unchecked points.
     uncheck_len = 3 if self.strict_monotonicity else 4
     for group in part_:
         if len(group) <= uncheck_len:
             if _Basic.check_proportion(single_series, group, self.kwargs):
                 array.append(group)
         else:
             try:  # only monotonicity is checked; could also enforce a min share per group (e.g. 0.05)
                 if _Basic.check_proportion(single_series, group,
                                            self.kwargs):
                     tmp_woe = _Basic.get_tmp_woe(df_input, single_series,
                                                  group, y.name,
                                                  self.kwargs)
                     if _Basic.check_monotonic(tmp_woe, self.kwargs):
                         array.append(group)
             except KeyError as error:
                 logging.error(error)
     if not array:
         return False
     # Re-cut with the last surviving split and rebuild the stats table.
     out = pd.cut(single_series, array[-1], include_lowest=True)
     out = out.cat.add_categories([self.fill_value]).fillna(self.fill_value)
     out = out.cat.remove_unused_categories()
     df_input[single_series.name] = out
     df_input = _Basic.get_pivot_table(df_input, y.name, single_series.name)
     # FIX: removed leftover debug print(df_input) that spammed stdout.
     df_input = _Basic.add_basic_info_to_df(df_input)
     df_input = _Basic.add_woe_iv_to_df(df_input)
     # the steps above repeat the earlier pivot/WOE pipeline
     df_output = _Basic.add_ks_to_df(df_input)
     return array[-1], df_output
Example #4
0
def get_sheet1_and_sheet2(df_train,
                          df_test=None,
                          drop_cols=None,
                          bad=1,
                          good=0,
                          cut_method="cumsum",
                          max_cut_part=10,
                          min_group_num=3,
                          group_min_percent=0.05,
                          check_monotonicity=True,
                          strict_monotonicity=True,
                          fill_value="-999",
                          response='code',
                          enable_iv_threshold=True,
                          iv_min=0.02,
                          keep_largest=2000,
                          enable_single_threshold=True,
                          single_threshold=0.8,
                          include_none=True,
                          n_jobs=4):
    """Bin the numeric variables of ``df_train`` and aggregate the results
    into ``sheet1`` (one summary row per variable) and ``sheet2`` (one
    per-bin stats table per variable) for the excel/csv report.

    :param df_train: training frame with feature columns and ``response``
    :param df_test: optional test frame; enables PSI columns in both sheets
    :param drop_cols: extra columns to drop (merged with uid/umobile/date)
    :param bad, good: response values marking bad / good samples
    :param response: response column name
    :return: ``(sheet1, sheet2, new_bins)`` dicts keyed by variable name;
        ``new_bins`` holds the final cut points with outer edges widened
        to +/-inf
    """
    if drop_cols is None:
        drop_cols = ["uid", "umobile", "date"]
    else:
        drop_cols = list(set(["uid", "umobile", "date"] + drop_cols))
    df_train = df_train.drop(drop_cols, axis=1)
    # Split numeric vs object columns; each subset keeps the response.
    df_train_num = df_train.select_dtypes(exclude=["object"])
    if response not in df_train_num:
        df_train_num[response] = df_train[response]
    df_train_obj = df_train.select_dtypes(include=["object"])
    if response not in df_train_obj:
        df_train_obj[response] = df_train[response]
    num_cols = list(df_train_num.drop(response, axis=1).columns)
    sheet1_cols = [
        'var', 'iv', 'ks', 'group_num', 'missing_percent',
        'max_single_percent', 'min', '25%', '50%', '75%', 'max', 'std', 'mean',
        'mode', 'median'
    ]
    sheet2_cols = [
        'Bad_count', 'Good_count', 'IV', 'KS', 'WOE', 'cum_bad_percent',
        'cum_good_percent', 'default_percent', 'inside_bad_percent',
        'inside_good_percent', 'total_percent_x', 'total_x', 'var_name',
        'var_new_x', 'var_scope'
    ]
    if df_test is not None:  # PSI-related columns exist only with a test set
        sheet1_cols.insert(3, 'psi')
        sheet2_cols.insert(4, 'PSI')
        sheet2_cols.append('var_new_y')
        sheet2_cols.append('total_y')
        sheet2_cols.append('total_percent_y')
    log("开始进行连续性变量分箱操作:{0}".format(cut_method))
    binning = Binning(cut_method=cut_method,
                      strict_monotonicity=strict_monotonicity,
                      min_proportion=group_min_percent,
                      bad=bad,
                      good=good,
                      max_cut_part=max_cut_part,
                      fill_value=fill_value)
    results = binning.fit(df_train,
                          df_train[response],
                          columns=num_cols,
                          n_jobs=n_jobs)
    log("分箱结束!开始进行连续性数据聚合...")
    sheet1 = {}
    sheet2 = {}
    new_bins = {}
    # results structure: {name: (cut points, binned DataFrame)}.
    # FIX: dict.iteritems() is Python-2 only; items() works on 2 and 3.
    for name, value in results.items():
        if value is False:  # binning failed for this variable
            continue
        df_ = value[1].sort_index()  # becomes this variable's sheet2 entry
        s_ = df_train[name]
        df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
        df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()

        if enable_single_threshold is True:  # single-value dominance filter
            if df_["total_percent_x"].max() > single_threshold:
                continue

        if enable_iv_threshold is True:  # IV filter
            if df_["IV"].replace([np.inf, -np.inf], 0).sum() < iv_min:
                continue

        if df_.shape[0] < min_group_num:  # minimum group count filter
            continue
        asC_name = name + "_asC"
        # Build sheet2 for this variable.
        df_["var_name"] = [asC_name] * df_.shape[0]
        df_["var_new_x"] = df_.index
        df_[asC_name] = df_["var_new_x"].cat.codes
        df_.set_index(asC_name, inplace=True)
        # Widen the outer edges to +/-inf so unseen data never falls outside.
        cut_points = value[0]
        cut_points[0] = -np.inf
        cut_points[-1] = np.inf
        new_bins[name] = cut_points  # store cut points with infinite edges
        if df_test is not None:
            tmp_c = pd.cut(df_test[name], cut_points)
        else:
            tmp_c = pd.cut(s_, cut_points)

        # If the train table has exactly one extra group, it is the NaN
        # fill bin; mirror it in the re-cut series.
        if len(tmp_c.cat.categories) + 1 == df_.shape[0]:
            tmp_c = tmp_c.cat.add_categories([fill_value]).fillna(fill_value)

        df_["var_scope"] = tmp_c.cat.categories
        if df_test is not None:
            # Pivot the test set into the same bins to compute PSI.
            tmp_c = tmp_c.to_frame()
            tmp_c.columns = ['bin']
            tmp_c[response] = df_test[response]
            tmp_c["_tmp_count_"] = np.ones(
                tmp_c.shape[0])  # concrete count column for aggfunc=len
            tmp_pivot_df = pd.pivot_table(tmp_c,
                                          index='bin',
                                          columns=response,
                                          aggfunc=len,
                                          values="_tmp_count_").fillna(0)
            tmp_pivot_df['total'] = tmp_pivot_df[good] + tmp_pivot_df[bad]
            df_['total_y'] = tmp_pivot_df['total'].values
            df_['total_percent_y'] = df_['total_y'].div(df_['total_y'].sum())
            df_['PSI'] = df_["total_percent_x"].div(
                df_["total_percent_y"]).map(np.log).mul(
                    df_["total_percent_x"].sub(df_["total_percent_y"]))
            df_['var_new_y'] = df_['var_new_x']

        sheet2[name] = df_[sheet2_cols].replace([np.inf, -np.inf, np.nan], 0)
        # Build the sheet1 summary row from describe() plus derived stats.
        r = s_.describe().to_frame().T
        r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
        r["ks"] = df_["KS"].max()
        r["group_num"] = df_.shape[0]
        r["missing_percent"] = 1 - r["count"] / s_.shape[0]
        r["max_single_percent"] = (df_["total_x"] / df_["total_x"].sum()).max()
        r["mode"] = ','.join(s_.mode().astype(str).values)
        r["median"] = s_.median()
        r["var"] = asC_name
        if df_test is not None:
            r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()

        sheet1[name] = r[sheet1_cols]

    log("开始稀疏变量分箱操作")
    df_train_obj[response] = df_train_obj[response].replace(
        good, "Good_count").replace(bad, "Bad_count")
    # NOTE(review): this loop iterates every object column, including the
    # response column itself, and its results are not yet merged into the
    # sheets (see TODO below).  FIX: removed a leftover debug print(name).
    for name in df_train_obj.columns:
        if enable_single_threshold is True:  # single-value dominance filter
            max_count = float(
                df_train_obj.groupby(name)[response].count().max())
            if max_count / df_train_obj.shape[0] > single_threshold:
                continue
        tmp_c = _Basic.basic_prepare(df_train_obj[response],
                                     bad=bad,
                                     good=good)
        tmp_c[name] = df_train_obj[name]
        tmp_c = _Basic.get_pivot_table(tmp_c, response=response, column=name)
        tmp_c = _Basic.add_basic_info_to_df(tmp_c)
        tmp_c = _Basic.add_woe_iv_to_df(tmp_c)
        tmp_c = _Basic.add_ks_to_df(tmp_c)
    # todo: filter string variables (e.g. province) and merge into sheets

    return sheet1, sheet2, new_bins
Example #5
0
    def merge_asD(self, results):
        """Aggregate discretized ("asD") binning results into sheet1/sheet2.

        ``results`` maps variable name -> ``(cut_info, binned_df)`` where
        ``cut_info`` is ``False`` (binning failed), the string ``'single'``
        (every value is its own bin), or a dict ``{code: [values]}``.

        :param results: dict of per-variable binning results
        :return: ``(sheet1, sheet2, new_bins)`` dicts keyed by variable name
        :raises ValueError: when ``cut_info`` has an unexpected type
        """
        show("分箱结束!开始进行数值型数据聚合...")
        sheet1 = {}
        sheet2 = {}
        new_bins = {}
        pg = Progressive(len(results), step=2)
        speed_of_progress = 0
        # FIX: dict.iteritems() is Python-2 only; items() works on 2 and 3.
        for name, value in results.items():
            pg.bar(speed_of_progress, "数值型数据聚合")
            speed_of_progress += 1
            df_ = value[1]
            s_ = self.df_train[name]
            asD_name = name + "_asD"
            if value[0] is False:  # binning failed for this variable
                continue
            elif value[0] == 'single':
                df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
                df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
                if self.enable_single_threshold is True:  # dominance filter
                    if df_["total_percent_x"].max() > self.single_threshold:
                        continue
                if self.enable_iv_threshold is True:  # IV filter
                    if df_["IV"].replace([np.inf, -np.inf],
                                         0).sum() < self.iv_min:
                        continue
                df_["var_name"] = [asD_name] * df_.shape[0]
                df_["var_new_x"] = df_.index
                df_["var_scope"] = df_.index
                df_[asD_name] = range(0, df_.shape[0])
                df_.set_index(asD_name, inplace=True)
                if self.df_test is not None:
                    # Count test-set occurrences per value to compute PSI.
                    tmp_df = s_.to_frame().fillna(self.fill_value)
                    tmp_df["_tmp_count_"] = np.ones(tmp_df.shape[0])
                    tmp_df = tmp_df.groupby(name).count()
                    tmp_df.columns = ['total_y']
                    df_ = pd.concat([df_, tmp_df], axis=1)
                    df_['total_percent_y'] = df_['total_y'].div(
                        df_['total_y'].sum())
                    df_['PSI'] = df_["total_percent_x"].div(
                        df_["total_percent_y"]).map(np.log).mul(
                            df_["total_percent_x"].sub(df_["total_percent_y"]))
                    df_['var_new_y'] = df_['var_new_x']
                new_bins[name] = "single"
                # FIX: sheet2 is written exactly once here; the original
                # also assigned it inside the df_test branch, which was
                # immediately overwritten by this identical assignment.
                sheet2[name] = df_[self.sheet2_cols].replace(
                    [np.inf, -np.inf, np.nan], 0)
                r = s_.describe().to_frame().T  # sheet1 summary row
                r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
                r["ks"] = df_["KS"].max()
                r["group_num"] = df_.shape[0]
                r["missing_percent"] = 1 - r["count"] / s_.shape[0]
                r["max_single_percent"] = (df_["total_x"] /
                                           df_["total_x"].sum()).max()
                r["mode"] = ','.join(s_.mode().astype(str).values)
                r["var"] = asD_name
                # 'psi' exists only when a test set was supplied; the
                # explicit check replaces the original broad try/except
                # (and matches the dict branch below).
                if self.df_test is not None:
                    r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
                    sheet1[name] = r[[
                        'var', "iv", 'ks', 'psi', 'group_num',
                        'missing_percent', 'max_single_percent'
                    ]]
                else:
                    sheet1[name] = r[[
                        'var', "iv", 'ks', 'group_num', 'missing_percent',
                        'max_single_percent'
                    ]]

            elif isinstance(value[0], dict):
                groups = value[0]
                df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
                df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
                # if self.enable_single_threshold is True:  # dominance filter
                #     if df_["total_percent_x"].max() > self.single_threshold:
                #         continue
                if self.enable_iv_threshold is True:  # IV filter
                    if df_["IV"].replace([np.inf, -np.inf],
                                         0).sum() < self.iv_min:
                        continue
                # Build sheet2 for this variable.
                df_["var_name"] = [asD_name] * df_.shape[0]
                groups_ = {k: '|'.join(v) for k, v in groups.items()}
                df_["var_new_x"] = pd.Series(groups_)
                df_["var_scope"] = pd.Series(groups_)
                df_.index.name = asD_name
                if self.df_test is not None:
                    # Apply the same grouping to the test set for PSI.
                    test_df_ = _Basic.basic_prepare(
                        self.df_test[self.response],
                        bad=self.bad,
                        good=self.good)
                    test_df_[name] = self.df_test[name]
                    for code, items in groups.items():
                        test_df_[name] = test_df_[name].replace(items, code)
                    test_df_ = _Basic.get_pivot_table(test_df_, self.response,
                                                      name)
                    test_df_ = _Basic.add_basic_info_to_df(test_df_)
                    df_['total_y'] = test_df_['total'].values
                    df_['total_percent_y'] = df_['total_y'].div(
                        df_['total_y'].sum())
                    df_['PSI'] = df_["total_percent_x"].div(
                        df_["total_percent_y"]).map(np.log).mul(
                            df_["total_percent_x"].sub(df_["total_percent_y"]))
                    df_['var_new_y'] = df_['var_new_x']
                new_bins[name] = groups
                sheet2[name] = df_[self.sheet2_cols].replace(
                    [np.inf, -np.inf, np.nan], 0)
                # Build the sheet1 summary row.
                r = s_.describe().to_frame().T
                r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
                r["ks"] = df_["KS"].max()
                r["group_num"] = df_.shape[0]
                r["missing_percent"] = 1 - r["count"] / s_.shape[0]
                r["max_single_percent"] = (df_["total_x"] /
                                           df_["total_x"].sum()).max()
                r["mode"] = ','.join(s_.mode().astype(str).values)
                r["var"] = asD_name
                if self.df_test is not None:
                    r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
                    sheet1[name] = r[[
                        'var', "iv", 'ks', 'psi', 'group_num',
                        'missing_percent', 'max_single_percent'
                    ]]
                else:
                    sheet1[name] = r[[
                        'var', "iv", 'ks', 'group_num', 'missing_percent',
                        'max_single_percent'
                    ]]
            else:
                raise ValueError("分箱返回的结果不符合预期{0}".format(type(value[0])))

        return sheet1, sheet2, new_bins
Example #6
0
 def _bin_merge_asD(self, df_train_obj, columns):
     """Group the categories of each object column by WOE value, keep the
     grouping with the largest total IV, and build sheet1/sheet2 entries.

     :param df_train_obj: frame with the object columns plus the response
     :param columns: iterable of column names to process
     :return: ``(sheet1, sheet2, new_bins)`` dicts keyed by column name;
         ``new_bins`` maps each column to ``{bin_code: [categories]}``
     """
     log("开始数值型变量分箱操作")
     sheet1 = {}
     sheet2 = {}
     new_bins = {}
     for name in columns:
         if self.enable_single_threshold is True:  # dominance filter
             max_count = float(
                 df_train_obj.groupby(name)[self.response].count().max())
             if max_count / df_train_obj.shape[0] > self.single_threshold:
                 continue
         # Per-category counts plus WOE / IV / KS info.
         tmp_c = _Basic.basic_prepare(df_train_obj[self.response],
                                      bad=self.bad,
                                      good=self.good)
         tmp_c[name] = df_train_obj[name]
         tmp_c = _Basic.get_pivot_table(tmp_c,
                                        response=self.response,
                                        column=name)
         tmp_c = _Basic.add_basic_info_to_df(tmp_c)
         tmp_c = _Basic.add_woe_iv_to_df(tmp_c)
         if self.woe_fill == 'avg':  # fill placeholder WOE values with the mean
             avg = tmp_c["WOE"].replace(self.fill_items, np.nan).mean()
             tmp_c["WOE"] = tmp_c["WOE"].replace(self.fill_items, avg)
         tmp_c = _Basic.add_ks_to_df(tmp_c)
         # Cut the WOE axis with both strategies and keep the cut whose
         # regrouped table yields the maximum total IV.
         parts = []
         last_cut_points = []
         max_iv = 0
         parts, _ = _CutMethods.quantile_cut_flow(
             tmp_c['WOE'], parts, max_cut_part=self.max_cut_part)
         parts, _ = _CutMethods.cumsum_cut_flow(
             tmp_c['WOE'],
             parts,
             add_min_group=True,
             max_cut_part=self.max_cut_part)
         for part in parts:
             bins_ = pd.cut(tmp_c['WOE'], part, include_lowest=True)
             tmp_c['bins'] = bins_.cat.codes
             tmp_df = pd.pivot_table(tmp_c,
                                     values=['Bad_count', 'Good_count'],
                                     index='bins',
                                     aggfunc=np.sum)
             tmp_df = _Basic.add_basic_info_to_df(tmp_df)
             tmp_df = _Basic.add_woe_iv_to_df(tmp_df)
             iv_ = tmp_df['IV'].sum()
             if iv_ > max_iv:
                 # BUG FIX: max_iv was never updated, so the loop kept the
                 # LAST cut with IV > 0 rather than the max-IV cut.
                 max_iv = iv_
                 last_cut_points = part
         # Build sheet1 and sheet2 from the winning cut.
         tmp_c['bins'] = pd.cut(tmp_c['WOE'],
                                last_cut_points,
                                include_lowest=True).cat.codes
         groups = {}
         for i in tmp_c['bins'].unique():
             items_ = tmp_c[tmp_c['bins'] == i].index.tolist()
             groups[i] = items_
         df_ = _Basic.basic_prepare(df_train_obj[self.response],
                                    bad=self.bad,
                                    good=self.good)
         df_[name] = df_train_obj[name]
         # FIX: dict.iteritems() is Python-2 only; items() works on 2 and 3.
         for code, items in groups.items():
             df_[name] = df_[name].replace(items, code)
         df_ = _Basic.get_pivot_table(df_, self.response, name)
         df_ = _Basic.add_basic_info_to_df(df_)
         df_ = _Basic.add_woe_iv_to_df(df_)
         df_ = _Basic.add_ks_to_df(df_)
         df_["total_x"] = df_["Bad_count"] + df_["Good_count"]
         df_["total_percent_x"] = df_["total_x"] / df_["total_x"].sum()
         # Build sheet2 for this column.
         asD_name = name + "_asD"
         df_["var_name"] = [asD_name] * df_.shape[0]
         groups_ = {k: '|'.join(v) for k, v in groups.items()}
         df_["var_new_x"] = pd.Series(groups_)
         df_["var_scope"] = pd.Series(groups_)
         if self.df_test is not None:
             # Apply the same grouping to the test set for PSI.
             test_df_ = _Basic.basic_prepare(self.df_test[self.response],
                                             bad=self.bad,
                                             good=self.good)
             test_df_[name] = self.df_test[name]
             for code, items in groups.items():
                 test_df_[name] = test_df_[name].replace(items, code)
             test_df_ = _Basic.get_pivot_table(test_df_, self.response,
                                               name)
             test_df_ = _Basic.add_basic_info_to_df(test_df_)
             df_['total_y'] = test_df_['total'].values
             df_['total_percent_y'] = df_['total_y'].div(
                 df_['total_y'].sum())
             df_['PSI'] = df_["total_percent_x"].div(
                 df_["total_percent_y"]).map(np.log).mul(
                     df_["total_percent_x"].sub(df_["total_percent_y"]))
             df_['var_new_y'] = df_['var_new_x']
         new_bins[name] = groups
         sheet2[name] = df_[self.sheet2_cols].replace(
             [np.inf, -np.inf, np.nan], 0)
         # Build the sheet1 summary row.
         s_ = df_train_obj[name]
         r = s_.describe().to_frame().T
         r["iv"] = df_["IV"].replace([np.inf, -np.inf], 0).sum()
         r["ks"] = df_["KS"].max()
         r["group_num"] = df_.shape[0]
         r["missing_percent"] = 1 - r["count"] / s_.shape[0]
         r["max_single_percent"] = (df_["total_x"] /
                                    df_["total_x"].sum()).max()
         r["mode"] = ','.join(s_.mode().astype(str).values)
         r["var"] = asD_name
         # 'psi' exists only when a test set was supplied; the explicit
         # check replaces the original broad try/except Exception.
         if self.df_test is not None:
             r["psi"] = df_['PSI'].replace([np.inf, -np.inf], 0).sum()
             sheet1[name] = r[[
                 'var', "iv", 'ks', 'psi', 'group_num', 'missing_percent',
                 'max_single_percent'
             ]]
         else:
             sheet1[name] = r[[
                 'var', "iv", 'ks', 'group_num', 'missing_percent',
                 'max_single_percent'
             ]]
     return sheet1, sheet2, new_bins