Esempio n. 1
0
    def cut_main(self):
        """Bin the training features, export the WOE table and WOE-encode the data sets.

        Steps performed:

        1. Every column in ``self.dist_col`` with at most one distinct non-NaN
           value in ``self.df_train`` is dropped; for the remaining columns the
           distinct values are passed to ``sc.woebin`` as explicit break points.
        2. ``sc.woebin`` is run repeatedly, starting with ``self.max_bins`` as
           the bin limit and lowering it by one per round while it stays
           above 2.  A column of ``self.serial_col`` whose WOE values are not
           strictly increasing (bins ordered by lower bound, "missing" bin
           ignored) is queued for the next round.
        3. Accepted bin tables are accumulated in ``self.woe_df`` and written
           to ``"<self.filename>_bin_result.xlsx"``.
        4. ``self.df_train`` and, when non-empty, ``self.df_test`` and
           ``self.df_ott`` are transformed with ``sc.woebin_ply``; a trailing
           ``_woe`` suffix is stripped from the resulting column names.

        Side effects: overwrites ``self.dist_col``, ``self.woe_df``,
        ``self.bins_adj``, ``self.df_train_woe`` and, conditionally,
        ``self.df_test_woe`` / ``self.df_ott_woe``; writes one Excel file.
        """

        def strip_woe_suffix(name):
            # Only remove a trailing "_woe"; a plain str.replace would also
            # mangle names that merely contain "_woe" somewhere in the middle.
            return name[:-len("_woe")] if name.endswith("_woe") else name

        def lower_bound(label):
            # Bin labels look like "[a,b)"; labels without a comma are kept
            # as-is.  NOTE(review): a mix of floats and strings in this key
            # cannot be sorted on Python 3 - confirm such labels never occur.
            return float(label.split(",")[0].replace("[", "")) if "," in label else label

        breaks_adj = {}
        for col in self.dist_col:
            # ``v == v`` is False only for NaN, so missing values are dropped.
            levels = [v for v in self.df_train[col].unique() if v == v]
            if len(levels) <= 1:
                self.log.info("{} has been deleted, because only has one single value".format(col))
                continue
            breaks_adj[col] = levels

        self.dist_col = list(breaks_adj.keys())
        cut_cols = self.dist_col + self.serial_col
        max_bins = self.max_bins
        report_cols = ["variable", "bin", "woe", "bin_iv", "bad", "badprob"]
        self.woe_df = pd.DataFrame(columns=report_cols)
        self.bins_adj = {}
        while max_bins > 2 and len(cut_cols) > 0:
            self.log.info("现在是分{}箱, 有{}个变量需要分箱".format(max_bins, len(cut_cols)))
            if self.target_name not in cut_cols:
                cut_cols.append(self.target_name)
            bins_adj = sc.woebin(self.df_train[cut_cols], y=self.target_name, breaks_list=breaks_adj,
                                 bin_num_limit=max_bins, method=self.method)
            # From here on cut_cols collects the columns to retry next round.
            cut_cols = []
            for key, binned in bins_adj.items():
                if key in self.serial_col:
                    # Explicit copy: a helper column is added below and must
                    # not be written onto a slice of the original bin table.
                    ordered = binned[binned.bin != "missing"].copy()
                    if len(ordered) == 1:
                        # A single non-missing bin is not reported in woe_df.
                        continue
                    ordered["min"] = ordered.bin.map(lower_bound)
                    ordered = ordered.sort_values(by="min")
                    woes = ordered.woe.tolist()
                    # NOTE(review): only strictly *increasing* WOE passes;
                    # a strictly decreasing trend is retried as well.
                    if not all(lo < hi for lo, hi in zip(woes, woes[1:])):
                        cut_cols.append(key)
                        continue
                self.woe_df = pd.concat([self.woe_df, binned[report_cols]])
            max_bins = max_bins - 1
            # Later rounds overwrite the bins of columns that were retried.
            self.bins_adj = dict(self.bins_adj, **bins_adj)
        self.log.info("仍有{}个特征无法满足单调的需求,不能分箱".format(len(cut_cols)))
        self.woe_df.to_excel("{}_bin_result.xlsx".format(self.filename))
        self.log.info("*" * 50)
        self.log.info("WOE Detail can be checked in {}_bin_result.xlsx".format(self.filename))

        self.df_train_woe = sc.woebin_ply(self.df_train, self.bins_adj)
        self.df_train_woe.columns = [strip_woe_suffix(c) for c in self.df_train_woe.columns]
        # ``.empty`` instead of ``.any().any()``: a non-empty frame that happens
        # to hold only falsy values must still be transformed.
        if not self.df_test.empty:
            self.df_test_woe = sc.woebin_ply(self.df_test, self.bins_adj)
            self.df_test_woe.columns = [strip_woe_suffix(c) for c in self.df_test_woe.columns]
        if not self.df_ott.empty:
            self.df_ott_woe = sc.woebin_ply(self.df_ott, self.bins_adj)
            self.df_ott_woe.columns = [strip_woe_suffix(c) for c in self.df_ott_woe.columns]
        self.log.info("CutBin has finished!")
Esempio n. 2
0
def woe_trans(df, bin_dict, trans_feature, target="target"):
    """
    Apply a WOE transformation to the selected features.

    :param df: original data, including the target variable
    :param bin_dict: dict mapping feature name to its bin table
    :param trans_feature: list of features to transform
    :param target: str, name of the target variable
    :return: (df_woe, new_feature) - the transformed frame and the list of
        generated column names (``<feature>_woe``)
    :raises SystemExit: with exit status 1 when a requested feature has no
        entry in ``bin_dict`` or an expected ``*_woe`` column is missing
        from the transformed frame
    """
    df = IC.check_df(df)
    bin_dict = IC.check_dict_of_df(bin_dict)
    trans_feature = IC.check_list(trans_feature)
    if not set(trans_feature).issubset(set(bin_dict.keys())):
        print("bin_dict.key",bin_dict.keys(),'\n')
        print("trans_feature:",trans_feature)
        warnings.warn("trans_feature not in bin_dict.keys, Please double check feature set")
        # Non-zero status: this is a failure and must not look like a clean exit.
        raise SystemExit(1)

    dt = df[trans_feature + [target]]
    df_woe = woebin_ply(dt=dt, bins=bin_dict)
    new_feature = [name + "_woe" for name in trans_feature]

    # Every requested feature must have produced its "<name>_woe" column.
    produced = set(df_woe.columns.difference([target]).tolist())
    if not set(new_feature).issubset(produced):
        warnings.warn("new feature not in df_woe.columns, Please double check feature set")
        raise SystemExit(1)

    return df_woe, new_feature
Esempio n. 3
0
    def woe_transform(self, train, test):
        """Fit WOE bins on ``train`` and apply them to both ``train`` and ``test``.

        ``train`` is first passed through ``sc.var_filter`` (always keeping
        'INDUSTRY'), then handed to ``self.encode_categorical`` before the bins
        are fitted against 'DEFAULT_FLAG'.  ``test`` is reduced to a fixed list
        of columns, encoded the same way and transformed with the bins learned
        on ``train``.

        Returns a ``(train_woe, test_woe)`` tuple.
        """
        # Fixed set of columns taken from the test data.
        kept_columns = [
            'ACCESS_CREDIT', 'ASSESSMENT_YEAR', 'MEDIUM_TERM_LIQUIDITY',
            'OWNERS_MANAGEMENT', 'PRODUCT_DEMAND', 'PROFITABILITY',
            'SHORT_TERM_LIQUIDITY', 'TURNOVER', 'DEFAULT_FLAG', 'INDUSTRY'
        ]

        # Training side: variable filtering, categorical encoding, bin fitting.
        filtered_train = sc.var_filter(train, 'DEFAULT_FLAG', var_kp='INDUSTRY')
        self.encode_categorical(filtered_train)
        fitted_bins = sc.woebin(filtered_train, 'DEFAULT_FLAG')
        encoded_train = sc.woebin_ply(filtered_train, fitted_bins)

        # Test side: same encoding, bins reused from the training data.
        test_subset = test[kept_columns]
        self.encode_categorical(test_subset)
        encoded_test = sc.woebin_ply(test_subset, fitted_bins)

        return encoded_train, encoded_test
Esempio n. 4
0
# Bin the numeric columns of the training data.
# NOTE(review): `rs` is a project helper not visible here; the unpacking and
# the dict merges below assume it returns (bins-by-column, IV-by-column) dicts
# - confirm against its definition.
num_bin, num_iv = rs.num_bin(df=train,
                             cols=num_cols,
                             target='target',
                             specials=[-99999],
                             bin_num_limit=8,
                             count_distr_limit=0.05,
                             sc_method='chimerge',
                             non_mono_cols=['age_in_years'],
                             init_bins=15,
                             init_min_samples=0.05,
                             init_method='chi')

# Merge categorical and numeric results; on a key clash the numeric entry wins.
bins = {**cat_bin, **num_bin}
ivs = {**cat_iv, **num_iv}
# WOE transformation of every data set with the same merged bins
train_woe = sc.woebin_ply(dt=train, bins=bins)
valid_woe = sc.woebin_ply(dt=valid, bins=bins)
df_woe = sc.woebin_ply(dt=df, bins=bins)

#########################################
# 3. Feature selection
#########################################
sf = rs.SelectFeature()
# Filter on the IV dict with threshold 0.05, without plotting.
high_iv = sf.baseOn_iv(ivd=ivs, thred=0.05, is_draw=False)
# Collinearity-based filter on the WOE-encoded training data (threshold 0.7).
low_vif = sf.baseOn_collinear(df=train_woe,
                              high_iv=high_iv,
                              thred=0.7,
                              is_draw=False)
ml_cols, best_C = sf.baseOn_l1(X=train_woe[low_vif.keys()],
                               y=train_woe['target'],
                               Kfold=5,
Esempio n. 5
0
 def transform(self, X):
     """Return ``X`` transformed by ``sc.woebin_ply`` using ``self.bins``."""
     return sc.woebin_ply(X, self.bins)
Esempio n. 6
0
                     breaks_list=break_list)
# Replace the automatically derived JOB bins with the manually specified ones.
bins['JOB'] = job_bins['JOB']

# Plot WOE bins
# fig, axs = plt.subplots(ncols=2)
# sc.woebin_plot(bins, figsize=[8,5])

# Print results of binning
# for k, bin_ in bins.items():
#     print(bins[k].iloc[:,0:-2].round(2).to_latex(index=False))

# split into train and test set
train, test = sc.split_df(df, 'BAD').values()

# Convert values into woe
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)

# Add constant
train_woe = sm.add_constant(train_woe)
test_woe = sm.add_constant(test_woe)

# Separate the target column 'BAD' from the predictors.
y_train = train_woe.loc[:, 'BAD']
X_train = train_woe.loc[:, train_woe.columns != 'BAD']
y_test = test_woe.loc[:, 'BAD']
# Build the mask from test_woe's own columns: a mask taken from train_woe is
# only correct when both frames have identical column order and count.
X_test = test_woe.loc[:, test_woe.columns != 'BAD']

# Fit logit model
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())
fit = lr.fit()
Esempio n. 7
0
if __name__ == '__main__':
    # Load the training data from a path relative to the working directory.
    train_data = pd.read_csv('./data/TrainData.csv')

    # Binning (chi-square merge or tree); these two columns get manual breaks
    break_list = {'DebtRatio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1.7],
                  'NumberRealEstateLoansOrLines': [0, 1, 2, 3]}

    cutoff = sc.woebin(train_data, y='SeriousDlqin2yrs', method='chimerge', breaks_list=break_list)
    # print(cutoff["NumberOfTimes90DaysLate"]["woe"])

    feature_index = ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
             'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
             'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']  # x-axis labels

    train_woe = sc.woebin_ply(train_data, cutoff)
    # Column selection: the target first, then "<feature>_woe" for each feature.
    woe_index = ['{}_{}'.format(s, 'woe') for s in feature_index]
    woe_index.insert(0, 'SeriousDlqin2yrs')
    train_woe = train_woe.loc[:, woe_index]

    # test data: transformed with the bins fitted on the training data
    test_data = pd.read_csv('./data/TestData.csv')
    test_woe = sc.woebin_ply(test_data, cutoff)
    test_woe = test_woe.loc[:, woe_index]

    # train lr: split predictors from the target column
    x_train = train_woe.loc[:, train_woe.columns != 'SeriousDlqin2yrs']
    y_train = train_woe.loc[:, 'SeriousDlqin2yrs']

    x_train = x_train.drop(
        ['DebtRatio_woe', 'MonthlyIncome_woe', 'NumberOfOpenCreditLinesAndLoans_woe',
Esempio n. 8
0
def execute_data():
    """Score the JSON payload of the current request and return a JSON response.

    The request body must contain ``paramsData`` (a JSON object with the raw
    feature values); the model file path is derived from the request by
    ``modelFilePathCheck``.  The model is cached in module-level globals and
    reloaded only when the requested path differs from the cached one.

    On success the response carries ``code`` '00000', the total ``score``,
    the predicted ``label`` and the per-feature points in ``featureScore``.
    Failures are reported through the module-level ``code10001`` (KeyError),
    ``code10002`` (ValueError or non-object ``paramsData``) and ``code10003``
    (anything else) dicts, with ``errorMsg`` filled in.
    """
    start = time.time()
    try:
        global modelPath, binsPath, bins, model
        logger.info('入参:%s', str(request.get_data()))
        request_data = request.get_json()  # parsed request body
        paramsJson = request_data['paramsData']
        modelFilePath = modelFilePathCheck(request_data)

        if isinstance(paramsJson, dict):

            # Convert values that is_number() accepts to int.
            for key, value in paramsJson.items():
                if is_number(value):
                    paramsJson[key] = int(value)
            logger.info("paramsJson:%s", paramsJson)
            # Build a one-row DataFrame from the parameters.
            df = pd.json_normalize(paramsJson)

            if modelPath != modelFilePath:
                logger.info('调用模型路径发生变化,重新加载模型!')
                logger.info('global modelFilePath:%s', modelPath)
                logger.info('param modelFilePath:%s', modelFilePath)
                # Load first and record the path only once loading succeeded;
                # otherwise a failed load would leave the previous model cached
                # under the new path and later requests would silently use it.
                # NOTE(review): joblib.load unpickles the file - the path must
                # come from a trusted location (presumably enforced by
                # modelFilePathCheck; verify).
                model = joblib.load(modelFilePath)
                modelPath = modelFilePath
            else:
                logger.info('调用模型路径未发生变化,使用缓存中模型。')
                logger.info('global modelFilePath:%s', modelPath)

            # Transform the raw values into WOE values.
            bins = model.bins
            df_woe = sc.woebin_ply(df, bins)
            # Predict the label.
            label = model.predict(df_woe)[0]

            # Build the scorecard.
            card = sc.scorecard(bins, model, df_woe.keys())
            # Score the request.
            score = sc.scorecard_ply(df,
                                     card,
                                     only_total_score=False,
                                     print_step=0)
            # Collect the points of every individual feature.
            featureScore = {}
            # featureScore = calculateFeatures(df, card)
            if isinstance(card, dict):
                card_df = pd.concat(card, ignore_index=True)
            elif isinstance(card, pd.DataFrame):
                card_df = card.copy(deep=True)
            else:
                # Fail with a clear message instead of an unbound-variable error.
                raise TypeError('unsupported scorecard type: ' + type(card).__name__)
            # x variables: every scorecard entry except the base points
            xs = card_df.loc[card_df.variable != 'basepoints',
                             'variable'].unique()
            for i in xs:
                featureScore[i] = score[i + '_points'][0]

            result = {}
            result['code'] = '00000'
            result['score'] = str(score['score'][0])
            result['label'] = str(label)
            result['featureScore'] = featureScore
            end = time.time()
            logger.info("运行结果:%s,模型执行耗时:%s", result, end - start)
            return jsonify(result)

        # paramsData was present but is not a JSON object.
        code10002['errorMsg'] = '输入值错误:请传入json格式参数'
        return jsonify(code10002)

    except KeyError as e:
        logger.info(e)
        code10001['errorMsg'] = '输入特征错误:' + str(e)
        return jsonify(code10001)

    except ValueError as e:
        logger.info(e)
        code10002['errorMsg'] = '输入值错误:' + str(e)
        return jsonify(code10002)

    except Exception as e:
        # Unexpected failure: keep the traceback in the log for diagnosis.
        logger.exception(e)
        code10003['errorMsg'] = '未知错误:' + str(e)
        return jsonify(code10003)