Ejemplos de woebin en Python, ejemplos de scorecardpy.woebin en Python

Ejemplo n.º 1

0

Mostrar archivo

    def category_bin(self,bin_feature,max_num_bin=None):
        """
        Bin unordered categorical variables with ``woebin``.

        :param bin_feature: list, variables to bin (validated by ``IC.check_list``)
        :param max_num_bin: int, maximum number of bins; when None the instance
            default ``self._max_num_bin`` is used
        :return: bin_dict:dict, var_iv:dict -- for each variable, the binning
            table returned by ``woebin`` and the first unique value of its
            ``total_iv`` column
        """
        bin_feature = IC.check_list(bin_feature)
        t0 = time.process_time()
        bin_dict, var_iv = {}, {}
        df = self._df
        # Cast to str so unordered categorical variables are not mistaken for
        # continuous ones. NOTE: ``df`` is ``self._df`` itself, so the cast is
        # written back into the instance's frame.
        df[bin_feature] = df[bin_feature].astype("str")

        if max_num_bin is None:
            max_num_bin = self._max_num_bin
        else:
            max_num_bin = IC.check_int(max_num_bin)

        # Bin one variable at a time, passing only that column plus the target.
        for col in bin_feature:
            bin_dict[col] = woebin(dt=df[[col,self._target]],y=self._target,x=col,breaks_list=self._breaks_list,special_values=self._special_values,
                                   min_perc_fine_bin=self._min_per_fine_bin,min_perc_coarse_bin=self._min_per_coarse_bin,stop_limit=self._stop_limit,
                                   max_num_bin=max_num_bin,method=self._method)[col]
            var_iv[col] = bin_dict[col]["total_iv"].unique()[0]

        # time.process_time() already returns seconds; the former "*100/60"
        # factor made the reported figure wrong for the "秒" (seconds) label.
        elapsed_seconds = time.process_time() - t0
        print("处理完{}个无序类别变量,耗时:{}秒".format(len(bin_feature), elapsed_seconds))

        return bin_dict,var_iv

Ejemplo n.º 2

0

Mostrar archivo

    def self_bin(self,var,special_values=None,breaks_list=None):
        """
        Bin a single variable using caller-specified break points.

        :param var: str, the variable to bin
        :param special_values: list of dict; falls back to ``self._special_values`` when None
        :param breaks_list: list of dict; falls back to ``self._breaks_list`` when None
        :return: bin_dict:dict, var_iv:dict

        Example argument shapes::

            breaks_list = {
              'age.in.years': [26, 35, 37, "Inf%,%missing"],
              'housing': ["own", "for free%,%rent"]
              }
            special_values = {
              'credit.amount': [2600, 9960, "6850%,%missing"],
              'purpose': ["education", "others%,%missing"]
              }
        """
        var = IC.check_str(var)
        bin_dict, var_iv = {}, {}

        # Identity checks: ``== None`` would be evaluated element-wise (and then
        # fail inside ``if``) should an array-like ever be passed.
        if special_values is None:
            special_values = self._special_values
        else:
            special_values = IC.check_special_values(special_values)
        if breaks_list is None:
            breaks_list = self._breaks_list
        else:
            breaks_list = IC.check_list_of_dict(breaks_list)

        bin_dict[var] = woebin(dt=self._df[[var,self._target]], y=self._target, x=var,
                               breaks_list=breaks_list, special_values=special_values)[var]
        # Keep the variable's IV (first unique value of the total_iv column).
        var_iv[var] = bin_dict[var]["total_iv"].unique()[0]

        return bin_dict,var_iv

Ejemplo n.º 3

0

Mostrar archivo

    def cut_main(self):
        """Bin the training data, retrying non-monotonic serial columns with fewer bins.

        Steps, as implemented below:

        1. For every column in ``self.dist_col`` collect its non-NaN unique
           values as explicit breaks; columns with at most one such value are
           dropped from ``self.dist_col``.
        2. Starting at ``self.max_bins`` and decreasing by one per round (while
           the limit is above 2 and columns remain), run ``sc.woebin``.  Columns
           of ``self.serial_col`` whose WOE is not strictly increasing across
           their non-missing bins are queued for the next round; all other
           results are appended to ``self.woe_df``.
        3. Write ``self.woe_df`` to ``<filename>_bin_result.xlsx`` and apply the
           collected bins to the train / test / ott frames, stripping the
           ``_woe`` suffix from the resulting column names.
        """
        breaks_adj = {}
        for col in self.dist_col:
            # ``v == v`` is False only for NaN, so this keeps non-missing values.
            values = [v for v in self.df_train[col].unique() if v == v]
            if len(values) <= 1:
                self.log.info("{} has been deleted, because only has one single value".format(col))
                continue
            breaks_adj[col] = values

        self.dist_col = list(breaks_adj.keys())
        cut_cols = self.dist_col + self.serial_col
        max_bins = self.max_bins
        result_cols = ["variable", "bin", "woe", "bin_iv", "bad", "badprob"]
        self.woe_df = pd.DataFrame(columns=result_cols)
        self.bins_adj = {}
        while max_bins > 2 and len(cut_cols) > 0:
            self.log.info("现在是分{}箱, 有{}个变量需要分箱".format(max_bins, len(cut_cols)))
            if self.target_name not in cut_cols:
                cut_cols.append(self.target_name)
            bins_adj = sc.woebin(self.df_train[cut_cols], y=self.target_name, breaks_list=breaks_adj,
                                 bin_num_limit=max_bins, method=self.method)
            # From here on cut_cols collects the columns to retry next round.
            cut_cols = []
            for key, value in bins_adj.items():
                tmp = value.copy()
                if key in self.serial_col:
                    # Explicit copy: a column is added below, and assigning on a
                    # filtered slice triggers pandas' SettingWithCopyWarning.
                    tmp1 = tmp[tmp.bin != "missing"].copy()
                    if len(tmp1) == 1:
                        # A single non-missing bin: not recorded in woe_df.
                        continue
                    # Order bins by the lower bound parsed from labels such as
                    # "[a,b)"; labels without a comma are kept unchanged.
                    tmp1["min"] = tmp1.bin.map(
                        lambda x: float(x.split(",")[0].replace("[", "")) if x.find(",") > -1 else x)
                    tmp1 = tmp1.sort_values(by="min")
                    woes = tmp1.woe.tolist()
                    # NOTE(review): only a strictly increasing WOE passes; a
                    # strictly decreasing trend is also re-cut -- confirm intent.
                    if not all(x < y for x, y in zip(woes, woes[1:])):
                        cut_cols.append(key)
                        continue
                self.woe_df = pd.concat([self.woe_df, tmp[result_cols]])
            max_bins = max_bins - 1
            # Later rounds overwrite earlier entries for the same column.
            self.bins_adj = dict(self.bins_adj, **bins_adj)
        self.log.info("仍有{}个特征无法满足单调的需求，不能分箱".format(len(cut_cols)))
        self.woe_df.to_excel("{}_bin_result.xlsx".format(self.filename))
        self.log.info("*" * 50)
        self.log.info("WOE Detail can be checked in {}_bin_result.xlsx".format(self.filename))

        self.df_train_woe = sc.woebin_ply(self.df_train, self.bins_adj)
        self.df_train_woe.columns = [i.replace("_woe", "") for i in self.df_train_woe.columns]
        # ``.any().any()`` is True when at least one cell of the frame is truthy.
        if self.df_test.any().any():
            self.df_test_woe = sc.woebin_ply(self.df_test, self.bins_adj)
            self.df_test_woe.columns = [i.replace("_woe", "") for i in self.df_test_woe.columns]
        if self.df_ott.any().any():
            self.df_ott_woe = sc.woebin_ply(self.df_ott, self.bins_adj)
            self.df_ott_woe.columns = [i.replace("_woe", "") for i in self.df_ott_woe.columns]
        self.log.info("CutBin has finished!")

Ejemplo n.º 4

0

Mostrar archivo

def num_bin(df:pd.DataFrame,cols:list=None,target:str='target',specials:list=None,
            bin_num_limit:int=5,count_distr_limit:float=0.05,sc_method='chimerge',
            non_mono_cols:list=None,init_bins=10,init_min_samples=0.05,init_method='chi',**kwargs):
    """Bin numeric columns: coarse binning, monotonicity adjustment, final WOE bins.

    :param df: frame holding the feature columns and the target column
    :param cols: columns to bin; when empty, every column except ``target``
    :param target: name of the target column
    :param specials: special values; when given, the same list is applied to
        every column in ``cols``
    :param bin_num_limit: forwarded to ``woebin``
    :param count_distr_limit: forwarded to ``woebin``
    :param sc_method: forwarded to ``woebin`` as ``method``
    :param non_mono_cols: columns binned by ``woebin`` alone, skipping the
        ``Combiner`` / ``monotonous_bin`` step
    :param init_bins: ``n_bins`` for the initial ``Combiner.fit``
    :param init_min_samples: ``min_samples`` for the initial ``Combiner.fit``
    :param init_method: ``method`` for the initial ``Combiner.fit``
    :param kwargs: extra keyword arguments forwarded to ``Combiner.fit``
    :return: ``(bind, ivd)`` -- binning table per column and IV per column
    """
    if not cols:
        cols = df.columns.difference([target]).tolist()

    if specials:
        specials = {k: specials for k in cols}

    if not non_mono_cols:
        non_mono_cols = []

    bind, ivd = {}, {}
    t0 = time.process_time()

    for col in cols:
        # Arguments shared by both paths; only the monotonic path adds breaks.
        woebin_kwargs = dict(dt=df, x=col, y=target, special_values=specials,
                             bin_num_limit=bin_num_limit, count_distr_limit=count_distr_limit,
                             method=sc_method, print_info=False)

        if col not in non_mono_cols:
            # Initial split points from the Combiner, then adjusted by
            # monotonous_bin before being handed to woebin as breaks_list.
            c = Combiner()
            c.fit(X=df[col], y=df[target],n_bins=init_bins,min_samples=init_min_samples,method=init_method,**kwargs)
            init_points = c.export()[col]
            woebin_kwargs['breaks_list'] = monotonous_bin(
                df=df, col=col, target=target, cutOffPoints=init_points, special_values=specials)

        bind[col] = woebin(**woebin_kwargs)[col]
        ivd[col] = bind[col]['total_iv'].unique()[0]

    # time.process_time() already returns seconds; the former "* 100 / 60"
    # factor inflated the figure that is labelled "seconds".
    elapsed_seconds = time.process_time() - t0
    print(f'there are bing {len(cols)} using {elapsed_seconds:.2f} seconds')
    return bind, ivd

Ejemplo n.º 5

0

Mostrar archivo

    def woe_transform(self, train, test):
        """Fit WOE bins on the filtered train set and apply them to train and test.

        ``train`` goes through ``sc.var_filter`` (always keeping 'INDUSTRY') and
        ``self.encode_categorical`` before ``sc.woebin`` is fitted on it.
        ``test`` is reduced to a fixed column list, passed through the same
        encoder, and transformed with the bins fitted on train.

        Returns ``(train_woe, test_woe)``.
        """
        target = 'DEFAULT_FLAG'

        # Train side: variable filtering, categorical encoding (per the original
        # note this is the one-hot encoding of 'INDUSTRY'), then binning.
        filtered_train = sc.var_filter(train, target, var_kp='INDUSTRY')
        self.encode_categorical(filtered_train)
        woe_bins = sc.woebin(filtered_train, target)
        train_woe = sc.woebin_ply(filtered_train, woe_bins)

        # Test side: fixed column selection, same encoder, bins from train.
        test_selected = test[[
            'ACCESS_CREDIT', 'ASSESSMENT_YEAR', 'MEDIUM_TERM_LIQUIDITY',
            'OWNERS_MANAGEMENT', 'PRODUCT_DEMAND', 'PROFITABILITY',
            'SHORT_TERM_LIQUIDITY', 'TURNOVER', 'DEFAULT_FLAG', 'INDUSTRY'
        ]]
        self.encode_categorical(test_selected)

        return train_woe, sc.woebin_ply(test_selected, woe_bins)

Ejemplo n.º 6

0

Mostrar archivo

def cat_bin(df:pd.DataFrame,cols:list=None,target:str='target',specials:list=None,
            bin_num_limit:int=5,count_distr_limit:float=0.05,method:str='chimerge',**kwargs):
    """Bin each column with ``woebin`` and collect its binning table and IV.

    :param df: frame holding the feature columns and the target column
    :param cols: columns to bin; when empty, every column except ``target``
    :param target: name of the target column
    :param specials: special values; when given, the same list is applied to
        every column in ``cols``
    :param bin_num_limit: forwarded to ``woebin``
    :param count_distr_limit: forwarded to ``woebin``
    :param method: forwarded to ``woebin``
    :param kwargs: extra keyword arguments forwarded to ``woebin``
    :return: ``(bind, ivd)`` -- binning table per column and IV per column
    """
    if not cols:
        cols = df.columns.difference([target]).tolist()

    if specials:
        specials = {k:specials for k in cols}

    bind, ivd = {}, {}
    t0 = time.process_time()
    for col in cols:
        bind[col] = woebin(dt=df,x=col,y=target,special_values=specials,bin_num_limit=bin_num_limit,
                           count_distr_limit=count_distr_limit,method=method,print_info=False,
                           **kwargs)[col]
        ivd[col] = bind[col]['total_iv'].unique()[0]

    # time.process_time() already returns seconds; the former "* 100 / 60"
    # factor inflated the figure that is labelled "seconds".
    elapsed_seconds = time.process_time() - t0
    print(f'there are bing {len(cols)} using {elapsed_seconds:.2f} seconds')
    return bind, ivd

Ejemplo n.º 7

0

Mostrar archivo

def make_bins(model_data: 'BinaryDependenceModelData', manual_breaks: dict = None) -> dict:
    """Bin the numeric variables of ``model_data`` against its binary target.

    Parameters
    ----------
    model_data
        Object providing ``num_cols``, ``is_data_converted``, ``base_data``,
        ``data`` and ``y_binary_name``.
    manual_breaks
        Predefined break points, ``{feature_name: [break1, break2]}``.  When a
        dict is passed it is forwarded to ``sc.woebin`` as ``breaks_list``;
        any other value is ignored.

    Returns
    -------
    dict
        Result of ``sc.woebin`` (features as keys, bin dataframes as values),
        or an empty dict when ``model_data.num_cols`` is not set.
    """
    numeric_cols = model_data.num_cols
    if not numeric_cols:
        warnings.warn('model_data.num_cols is not set')
        return {}

    if model_data.is_data_converted:
        warnings.warn(
            "Features in model_data seem to be already converted to binary format, binning might be futile")

    # Numeric features come from base_data, the binary target from data.
    target_name = model_data.y_binary_name
    frame = model_data.base_data[numeric_cols].join(model_data.data[target_name])
    woebin_args = dict(dt=frame, y=target_name)

    # TODO: manual breaks don't work exactly as expected. If there are no values
    #  in an interval, that break is not created.
    if isinstance(manual_breaks, dict):
        woebin_args['breaks_list'] = manual_breaks

    # TODO: add rounding of the breaks (for representation) without rerunning
    #  the binning; for now users can rerun with the manual_breaks argument.
    #  Earlier sketch, kept for reference:
    # breaks_adj = dict()
    # for c in bins.keys():
    #     for b in bins[c]['breaks'].tolist():
    #         if isinstance(b, str):
    #             continue
    #         breaks_adj[c] = round(float(b))
    # bins = sc.woebin(model_data.data[list(model_data.num_cols) + [model_data.y_binary_name]],
    #                  y=model_data.y_binary_name,
    #                  breaks_list=breaks_adj)
    return sc.woebin(**woebin_args)

Ejemplo n.º 8

0

Mostrar archivo

    def number_bin(self,bin_feature,max_num_bin=None,no_mono_feature=None):
        """
        Bin ordered numeric variables, enforcing monotonic bins unless exempted.

        :param bin_feature: list, variables to bin
        :param max_num_bin: int, maximum number of bins; defaults to ``self._max_num_bin``
        :param no_mono_feature: list, variables excluded from the monotonicity step
        :return: bin_dict:dict, var_iv:dict
        """
        t0 = time.process_time()
        bin_dict,var_iv = {},{}
        # Work on a deep copy so the float cast below does not alter self._df.
        df = copy.deepcopy(self._df)
        bin_feature = IC.check_list(bin_feature)
        # Cast to float so the columns are handled as continuous; a column that
        # cannot be cast makes this call raise.
        df[bin_feature] = df[bin_feature].astype("float")

        if max_num_bin is None:
            max_num_bin = self._max_num_bin
        else:
            max_num_bin = IC.check_int(max_num_bin)
        if no_mono_feature is None:
            no_mono_feature = []
        else:
            no_mono_feature = IC.check_list(no_mono_feature)

        for col in bin_feature:
            if isinstance(self._special_values,dict):
                # .get(): a column without an entry simply has no special values
                # (indexing raised KeyError for such columns).
                special_values = self._special_values.get(col)
            else:
                special_values = self._special_values
            # Container used only for membership tests; ``x in None`` would
            # raise TypeError when no special values are configured.
            excluded = [] if special_values is None else special_values

            # Skip variables left with a single value once specials are removed.
            unique_values = [v for v in df[col].unique() if v not in excluded]
            if len(unique_values) ==1:
                warnings.warn("There are {} columns have only one unique values,{}, which are skipping from this bing.".format(col,unique_values))
                continue

            if col not in no_mono_feature:
                raw_breaks = woebin(dt=df[[col,self._target]],y=self._target,x=col,breaks_list=self._breaks_list,special_values=special_values,
                                    min_perc_fine_bin=self._min_per_fine_bin,min_perc_coarse_bin=self._min_per_coarse_bin,stop_limit=self._stop_limit,
                                    max_num_bin=max_num_bin,method=self._method)[col]["breaks"].tolist()

                # Candidate cut points: finite numeric breaks that are not
                # special values.
                numeric_breaks = set()
                for i in set(raw_breaks):
                    if str(i) in ['inf','-inf']:
                        continue
                    try:
                        numeric_breaks.add(float(i))
                    except (TypeError, ValueError):
                        # Non-numeric break labels (e.g. the label of a missing
                        # bin) are not cut points; float() used to crash here.
                        continue
                cutOffPoints = sorted(i for i in numeric_breaks if i not in excluded)

                if not cutOffPoints:
                    # No usable cut point: fall back to every non-special value.
                    warnings.warn("There are zero cutOffPoint of {} columns from this bing, which select all unique values insert cutOffPoint".format(col))
                    cutOffPoints = sorted([i for i in df[col].unique() if i not in excluded])

                # Merge bins until monotonic; the result is passed on as
                # breaks_list for the final binning below.
                mono_cutOffPoints = monotonous_bin(df=df[[col,self._target]],col=col,cutOffPoints=cutOffPoints,
                                                   target=self._target,special_values=special_values)
            else:
                mono_cutOffPoints = {}

            # Final binning with the (possibly empty) monotonic break points.
            bin_dict[col] = woebin(dt=df[[col,self._target]],y=self._target,x=col,breaks_list=mono_cutOffPoints,special_values=special_values,
                                      min_perc_fine_bin=self._min_per_fine_bin,min_perc_coarse_bin=self._min_per_coarse_bin,stop_limit=self._stop_limit,
                                      max_num_bin=max_num_bin,method=self._method)[col]
            # Keep the variable's IV.
            var_iv[col] = bin_dict[col]["total_iv"].unique()[0]

        # time.process_time() already returns seconds; the former "* 100 / 60"
        # factor made the reported figure wrong for the "秒" (seconds) label.
        elapsed_seconds = time.process_time() - t0
        print("处理完{}个有序数值变量,耗时:{}秒".format(len(bin_feature), elapsed_seconds))
        return bin_dict, var_iv

Ejemplo n.º 9

0

Mostrar archivo

# Raw string: in a normal literal "\G", "\C", "\P" and "\h" are invalid escape
# sequences (a warning today, scheduled to become an error). The resulting
# path is character-for-character the same.
df = pd.read_csv(r"E:\GitHub\Credit-Scorecard-Project\Python\hmeq_clean.csv")

# df.drop(['VALUE'], axis=1, inplace=True)

# Apply Transformations
df.LOAN = np.log(df.LOAN)
# df.MORTDUE = np.log(df.MORTDUE)
# df.VALUE = np.log(df.VALUE)
# Variable contained zeros so added 1 year to every observation
df.YOJ = np.log(df.YOJ + 1)

# Drop REASON and MORTDUE
df.drop(['REASON', 'MORTDUE'], axis=1, inplace=True)

# Create WOE bins
bins = sc.woebin(df, 'BAD', method='chimerge')

# Job was not binning correctly, this fixed that:
# re-bin JOB with every distinct value as its own break and replace its entry.
break_list = {'JOB': df.JOB.unique().tolist()}
job_bins = sc.woebin(df,
                     'BAD',
                     method='chimerge',
                     x=['JOB'],
                     breaks_list=break_list)
bins['JOB'] = job_bins['JOB']

# Plot WOE bins
# fig, axs = plt.subplots(ncols=2)
# sc.woebin_plot(bins, figsize=[8,5])

# Print results of binning

Ejemplo n.º 10

0

Mostrar archivo

Archivo: scorecardpy.py Proyecto: gengbh/PythonLearning

# -- coding: utf-8 --
# Traditional Credit Scoring Using Logistic Regression
import scorecardpy as sc

# data prepare ------
# load the germancredit sample data shipped with scorecardpy
dat = sc.germancredit()

# filter variables via missing rate, iv, identical value rate
dt_s = sc.var_filter(dat, y="creditability")

# split the filtered data into train and test;
# the two values of the returned mapping are unpacked in order
train, test = sc.split_df(dt_s, 'creditability').values()

# woe binning ------
# NOTE: bins are fitted on dt_s, i.e. on the rows of both train and test.
bins = sc.woebin(dt_s, y="creditability")
# sc.woebin_plot(bins)

# binning adjustment
# # adjust breaks interactively
# breaks_adj = sc.woebin_adj(dt_s, "creditability", bins)
# # or specify breaks manually
# (presumably "%,%" joins several categories into one bin -- see the woebin docs)
breaks_adj = {
    'age.in.years': [26, 35, 40],
    'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]
}
# re-run the binning with the manual breaks; this result is the one applied below
bins_adj = sc.woebin(dt_s, y="creditability", breaks_list=breaks_adj)

# converting train and test into woe values using the adjusted bins
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)

Ejemplo n.º 11

0

Mostrar archivo

    data.drop(['SERVICES'], axis=1, inplace=True)
    # ------------------ Grouping zones that do not have enough data ------------------
    Clases_UPZ = np.unique(data['ZONE'])
    Data_UPZ = data.groupby('ZONE').groups
    for i in Clases_UPZ:
        # Index labels of the rows that belong to zone i.
        numero_clases = Data_UPZ[i]
        if len(numero_clases) < 10:
            # One .loc call on the frame itself. The former chained form
            # data['ZONE'].loc[...] = ... assigns into an intermediate Series
            # and is not guaranteed to write back into `data`.
            data.loc[numero_clases, 'ZONE'] = 'ZONE_Other'

    # -------------------- 1. OBTAINING BINS ---------------------------------------
    train_b, test_b = sc.split_df(data, y='OUTCOME', ratio=0.7,
                                  seed=100).values()
    # NOTE(review): parameter notes below follow the scorecardpy woebin
    # documentation -- confirm against the installed version.
    bins = sc.woebin(
        train_b,
        y='OUTCOME',
        min_perc_fine_bin=0.01,  # Minimum share of rows per initial (fine) bin
        min_perc_coarse_bin=0.05,  # Minimum share of rows per final bin
        stop_limit=0.2,  # Stopping threshold for further splitting
        max_num_bin=10,  # Maximum number of bins
        method='tree')

    # Transforming variables to dummies
    train, test, deleted_var = dummies_on(train_b, test_b, bins, continuous)
    # Defining Train Data and Test Data
    X_train = train[train.columns.difference(['OUTCOME'])]
    Y_train = train['OUTCOME']
    Y_train = Y_train.astype('int')
    X_test = test[test.columns.difference(['OUTCOME'])]
    Y_test = np.array(test['OUTCOME'], dtype=float)
    # Coefficient table indexed by feature name plus one extra bookkeeping row.
    Name_columns = X_test.columns
    Name_columns = Name_columns.union(['number_deleted_variables'])
    COEF = pd.DataFrame(index=Name_columns)

Ejemplo n.º 12

0

Mostrar archivo

labelEncoder = LabelEncoder()
data['status'] = labelEncoder.fit_transform(data['status'].values)
data = data.astype('str')

# Per the original note, var_filter by default drops variables with
# information value < 0.02, missing rate > 95% or single-category share > 95%.
dt_s = sc.var_filter(data, y='status')
print('变量预处理前后变化：', data.shape, '->', dt_s.shape)
# print(data.columns)
# print(dt_s.columns)

# WOE binning
bins = sc.woebin(dt_s, y='status')
# bins

train, test = sc.split_df(dt_s, 'status').values()
print('训练集、测试集划分比例为：', train.shape[0], ':', test.shape[0])

train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
# train_woe.head()

y_train = train_woe.loc[:, 'status']
X_train = train_woe.loc[:, train_woe.columns != 'status']
y_test = test_woe.loc[:, 'status']
# Build the mask from test_woe's own columns: the former mask came from
# train_woe.columns and only selected the right columns when both frames
# happened to share the same column order and length.
X_test = test_woe.loc[:, test_woe.columns != 'status']

lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: covid_19_script.py Proyecto: Alex0896/Credit-Scorecard-Project

# Collapse the age ranges found in the data to a single representative number.
# NOTE(review): '80-' is replaced wherever it occurs, so a value such as
# '80-89' would become '8089' -- confirm no such value exists in the data.
df['age'] = df['age'].str.replace('90-99', '95')
df['age'] = df['age'].str.replace('28-35', '31')
df['age'] = df['age'].str.replace('80-', '80')

df['age'] = df['age'].astype('float')
df.dropna(inplace=True)

# Move the third column to the front; the rest keep their relative order.
cols = df.columns.tolist()
cols = cols[2:3] + cols[:2] + cols[3:]
df = df[cols]

df.drop(['diabetes'], axis=1, inplace=True)

# Default bins for every variable.
bins = sc.woebin(df, 'outcome', method='chimerge')

# Every column from the third onwards gets the single break point 1.0
# and its default bins are replaced by the result.
cols = df.iloc[:, 2:].columns
break_list = {col: [1.0] for col in cols}
manual_bins = sc.woebin(df, 'outcome', method='chimerge', x=cols.tolist(),
                        breaks_list=break_list)
bins.update(manual_bins)

# Split into train and test sets.
train, test = sc.split_df(df, 'outcome').values()

# Convert values into WOE.
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)

Ejemplo n.º 14

0

Mostrar archivo

    def number_bin(self, bin_feature, max_num_bin=None, no_mono_feature=None):
        """
        Bin ordered numeric variables, falling back to plain binning on failure.

        :param bin_feature: list, variables to bin
        :param max_num_bin: int, maximum number of bins; defaults to ``self._max_num_bin``
        :param no_mono_feature: list, variables excluded from the monotonicity step
        :return: bin_dict:dict, var_iv:dict
        """
        t0 = time.process_time()
        bin_dict, var_iv = {}, {}
        df = self._df
        bin_feature = IC.check_list(bin_feature)
        # Cast to float so the columns are handled as continuous variables.
        # NOTE: ``df`` is ``self._df`` itself, so the cast is written back.
        df[bin_feature] = df[bin_feature].astype("float")

        if max_num_bin is None:
            max_num_bin = self._max_num_bin
        else:
            max_num_bin = IC.check_int(max_num_bin)
        if no_mono_feature is None:
            no_mono_feature = []
        else:
            no_mono_feature = IC.check_list(no_mono_feature)

        def run_woebin(col, breaks_list):
            # Single place for the woebin settings shared by all three calls.
            return woebin(
                dt=df,
                y=self._target,
                x=col,
                breaks_list=breaks_list,
                special_values=self._special_values,
                min_perc_fine_bin=self._min_per_fine_bin,
                min_perc_coarse_bin=self._min_per_coarse_bin,
                stop_limit=self._stop_limit,
                max_num_bin=max_num_bin,
                method=self._method)[col]

        for col in bin_feature:
            try:
                if col not in no_mono_feature:
                    cutOffPoints = run_woebin(col, self._breaks_list)["breaks"].tolist()
                    # Candidate cut points: every break except +/- infinity.
                    cutOffPoints = [
                        float(i) for i in cutOffPoints
                        if str(i) not in ['inf', '-inf']
                    ]

                    # Merge bins until monotonic.
                    mono_cutOffPoints = monotonous_bin(
                        df=df,
                        col=col,
                        cutOffPoints=cutOffPoints,
                        target=self._target,
                        special_values=self._special_values)
                else:
                    mono_cutOffPoints = None

                # Final binning.
                bin_dict[col] = run_woebin(col, mono_cutOffPoints)
                var_iv[col] = bin_dict[col]["total_iv"].unique()[0]

            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit are no longer swallowed by the fallback.
            except Exception:
                print("异常变量 {} 无法通过单调性检验".format(col))
                # Bin again without the monotonicity step.
                bin_dict[col] = run_woebin(col, self._breaks_list)
                print("变量{}的BadRate为{}".format(
                    col, bin_dict[col]['badprob'].tolist()))
                var_iv[col] = bin_dict[col]["total_iv"].unique()[0]

        # time.process_time() already returns seconds; the former "* 100 / 60"
        # factor made the reported figure wrong for the "秒" (seconds) label.
        elapsed_seconds = time.process_time() - t0
        print("处理完{}个有序数值变量,耗时:{}秒".format(
            len(bin_feature), elapsed_seconds))
        return bin_dict, var_iv

Ejemplo n.º 15

0

Mostrar archivo

 def __init__( self, X, y ):
     """Fit WOE bins on a copy of ``X`` extended with ``y`` as column 'target'."""
     # assign() returns a new frame, so the caller's X is left untouched.
     labelled = X.assign(target=y)
     self.bins = sc.woebin(labelled, y='target')

Ejemplo n.º 16

0

Mostrar archivo

# Traditional Credit Scoring Using Logistic Regression
import scorecardpy as sc
import matplotlib.pyplot as plt

# data prepare ------
# load the germancredit sample data shipped with scorecardpy
dat = sc.germancredit()

# filter variables via missing rate, iv, identical value rate
dt_s = sc.var_filter(dat, y="creditability")

# split the filtered data into train and test
train, test = sc.split_df(dt_s, 'creditability').values()

# woe binning ------
bins = sc.woebin(dt_s, y="creditability")
print(type(bins))
# Only the variable names are printed, so iterate the keys directly
# instead of unpacking .items() and discarding the values.
for k in bins:
    print(k)

# Inspect the binning result of a single variable.
print(bins["purpose"])
print(bins["purpose"].columns)
print(type(bins["purpose"]))
# sc.woebin_plot(bins["purpose"])
# plt.show()

# =============================================================================
# print("qq: 1467288927")
# =============================================================================

Ejemplo n.º 17

0

Mostrar archivo

Archivo: score.py Proyecto: Silver-L/score_card

'''

import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
import scorecardpy as sc
import numpy as np

if __name__ == '__main__':
    train_data = pd.read_csv('./data/TrainData.csv')

    # Binning (chi-square merge or tree). Two variables get manual break
    # points; the call below uses method='chimerge'.
    break_list = {
        'DebtRatio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1.7],
        'NumberRealEstateLoansOrLines': [0, 1, 2, 3],
    }
    cutoff = sc.woebin(train_data, y='SeriousDlqin2yrs', method='chimerge', breaks_list=break_list)
    # print(cutoff["NumberOfTimes90DaysLate"]["woe"])

    # Feature names (x-axis labels).
    feature_index = [
        'RevolvingUtilizationOfUnsecuredLines',
        'age',
        'NumberOfTime30-59DaysPastDueNotWorse',
        'DebtRatio',
        'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans',
        'NumberOfTimes90DaysLate',
        'NumberRealEstateLoansOrLines',
        'NumberOfTime60-89DaysPastDueNotWorse',
        'NumberOfDependents',
    ]

    # Target first, then one "<feature>_woe" column per feature.
    woe_index = ['SeriousDlqin2yrs'] + [f'{s}_woe' for s in feature_index]

    train_woe = sc.woebin_ply(train_data, cutoff).loc[:, woe_index]

    # Test data: same bins, same column selection.
    test_data = pd.read_csv('./data/TestData.csv')
    test_woe = sc.woebin_ply(test_data, cutoff).loc[:, woe_index]