def convert_scrd2rebin(score_card):
    '''
    Convert an already-generated online-deployment scorecard sheet
    ("2_评分卡", shared by logistic and xgb models) into a rebin_spec
    mapping, so offline data can be binned / WOE-transformed via:

        bin_obj = mt.BinWoe()
        X_cat_train = bin_obj.convert_to_category(data[selected], var_dict, rebin_spec)
        woe_iv_df_coarse = bin_obj.calculate_woe_all(X_cat_train, y_train, var_dict, rebin_spec)

    which yields per-bin statistics (overdue rate and distribution).

    Args:
    score_card (DataFrame): the scorecard

    Returns:
    rebin_spec (dict): variable name -> binning spec (numeric boundaries,
        or {1-based position: category list} for varchar variables)
    '''
    named_rows = score_card[score_card['中间层指标名称'].notnull()]
    numeric_vars = list(
        named_rows[named_rows['数据类型'] != 'varchar']['中间层指标名称'].unique())
    categorical_vars = list(
        named_rows[named_rows['数据类型'] == 'varchar']['中间层指标名称'].unique())

    rebin_spec = {}
    # Numeric variables: parse the bin labels back into cut boundaries.
    for var in numeric_vars:
        var_bins = score_card.loc[score_card['中间层指标名称'] == var, '分箱']
        rebin_spec[var] = mt.BinWoe().obtain_boundaries(var_bins)
    # Categorical variables: 1-based bin position -> list of raw categories.
    for var in categorical_vars:
        var_bins = score_card.loc[score_card['中间层指标名称'] == var, '分箱']
        rebin_spec[var] = {position + 1: label.split(',')
                           for position, label in enumerate(var_bins)}
    return rebin_spec
Beispiel #2
0
 def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
     """Group WOE-transformed variables into feature clusters.

     Applies the WOE transform to the binned data, then runs sklearn's
     FeatureAgglomeration on (a sample of) the transformed matrix so each
     variable is assigned a cluster id.

     Args:
     X_cat (pd.DataFrame): binned (categorical) X data
     woe_iv_df (pd.DataFrame): WOE/IV table used for the transform
     n_clusters (int): number of feature clusters, default 15

     Returns:
     (vars_clusters, X_transformed): cluster assignment per variable
         (sorted by cluster id) and the WOE-transformed data.
     """
     woe_X = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
     # Cap the fit at 20k rows to keep agglomeration tractable.
     fit_sample = woe_X.sample(20000) if len(woe_X) > 20000 else woe_X.copy()
     agglo = FeatureAgglomeration(n_clusters=n_clusters)
     agglo.fit(fit_sample)
     assignments = pd.DataFrame(data={'指标英文': woe_X.columns.tolist(),
                                      'cluster': list(agglo.labels_)})
     return assignments.sort_values('cluster'), woe_X
Beispiel #3
0
def get_decile(score,
               varscore,
               all_Y,
               y_col,
               label,
               RESULT_PATH,
               manual_cut_bounds=[]):
    """
    Build the KS decile table for one sample set, given scores and the
    corresponding Y definition, and attach per-bin average variable scores.

    Args:
    score (pd.Series): scores; index is apply_id
    varscore (pd.DataFrame): per-variable scores; index is apply_id
    all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14']; 'Y_build'
        name can be changed. index is apply_id
    y_col (str): column name of Y
    label (str): sample set label
    RESULT_PATH (str): result path (kept for interface compatibility)
    manual_cut_bounds (list): optional pre-set score cut boundaries

    Returns:
    (ks_decile, manual_cut_bounds): decile table (empty DataFrame when
        fewer than 11 labeled rows) and the boundaries actually used.
    """
    labeled = score.to_frame('score').merge(all_Y,
                                            left_index=True,
                                            right_index=True)
    labeled = labeled.loc[labeled[y_col].notnull()]

    # Need a minimal number of labeled rows to produce a meaningful decile.
    if len(labeled) > 10:
        ks_decile = mt.Performance().calculate_ks_by_decile(
            labeled.score,
            np.array(labeled[y_col]),
            'decile',
            10,
            manual_cut_bounds=manual_cut_bounds)
        ks_decile.loc[:, 'sample_set'] = label
    else:
        ks_decile = pd.DataFrame()

    scored_vars = varscore.merge(score.to_frame('score'),
                                 left_index=True,
                                 right_index=True)

    if ks_decile.empty:
        return pd.DataFrame(), manual_cut_bounds

    # Derive the cut boundaries from the decile labels when none were given.
    if len(manual_cut_bounds) == 0:
        manual_cut_bounds = mt.BinWoe().obtain_boundaries(
            ks_decile['分箱'])['cut_boundaries']

    scored_vars.loc[:, 'score_10bin'] = pd.cut(scored_vars.score,
                                               manual_cut_bounds).astype(str)
    per_bin_avg = (scored_vars.groupby('score_10bin')
                              .mean()
                              .reset_index()
                              .rename(columns={'score_10bin': '分箱'}))
    return ks_decile.merge(per_bin_avg, on='分箱'), manual_cut_bounds
Beispiel #4
0
def approval_rate_anaysis(backscore, RESULT_PATH, model_label, y_col):
    """Approval-rate vs approved-bad-rate trade-off from the RUNBOOK decile.

    Cuts the back-scored population with the RUNBOOK decile boundaries,
    then accumulates from the highest score bin downward to show, per
    cut-off, the approval rate and the bad rate of the approved segment.

    Args:
    backscore (pd.Series): scores of the back-scored population
    RESULT_PATH (str): directory containing all_decile.xlsx
    model_label (str): model name to select in the decile table
    y_col (str): Y definition to select in the decile table

    Returns:
    pd.DataFrame: per-bin distribution with ApprovalRate / ApprovedBadRate.
    """
    deciles = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    mask = ((deciles.sample_set == 'RUNBOOK')
            & (deciles.model_label == model_label)
            & (deciles.Y_definition == y_col))
    runbook = deciles.loc[mask].copy()
    cut_bounds = mt.BinWoe().obtain_boundaries(
        runbook['分箱'])['cut_boundaries']
    binned_scores = pd.cut(backscore, cut_bounds).astype(str)
    dist = (binned_scores.value_counts()
                         .to_frame('样本量')
                         .sort_index()
                         .reset_index()
                         .rename(columns={'index': '分箱'}))
    dist.loc[:, '分布占比'] = dist['样本量'] / dist['样本量'].sum()
    # Remember the natural bin order before re-sorting.
    dist = dist.reset_index().rename(columns={'index': 'bin_order'})
    dist = dist.merge(runbook[['分箱', '逾期率']], on='分箱', how='left')
    # Highest bins first so cumsum walks down the approval cut-off.
    dist = dist.sort_values('bin_order', ascending=False)
    dist.loc[:, 'ApprovalRate'] = dist['分布占比'].cumsum()
    dist.loc[:, 'ApprovedBadRate'] = (
        (dist['分布占比'] * dist['逾期率']).cumsum() / dist['分布占比'].cumsum())
    dist.loc[:, 'model_label'] = model_label
    return dist
Beispiel #5
0
def score_stability(score_dict, all_Y, y_col, train_set_name, RESULT_PATH,
                    model_label):
    """
    Compute score PSI of every sample set against the training set.

    Args:
    score_dict (dict): the key is the label of the sample set. e.g. 'train',
        'test', 'oot' etc. should be the same as data_cat_dict.keys()
        The value is the scores. index is apply_id
    all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'], 'Y_build'
        name can be changed. index is apply_id
    y_col (str): column name of Y
    train_set_name (str): key in score_dict used as the PSI baseline
    RESULT_PATH (str): directory containing all_decile.xlsx
    model_label (str): model name to select in the decile table

    Returns:
    pd.DataFrame: PSI rows per compared set, columns ordered as
        ['model_label', 'compare_set', <psi columns>]. Empty when there is
        no sample set other than the training set.
    """
    all_decile_df = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    the_bool = ((all_decile_df.model_label == model_label)
                & (all_decile_df.sample_set == train_set_name)
                & (all_decile_df.Y_definition == y_col))
    train_decile = all_decile_df.loc[the_bool].copy()
    point_bounds = mt.BinWoe().obtain_boundaries(
        train_decile['分箱'])['cut_boundaries']

    d_list = []
    psi_columns = None
    compare_set_names = [
        i for i in list(score_dict.keys()) if i != train_set_name
    ]
    for label in compare_set_names:
        logging.log(18, label + ' data set starts score PSI calculation')
        tmp = mt.Performance().score_psi(score_dict[train_set_name]['score'],
                                         score_dict[label]['score'],
                                         point_bounds)
        # Capture the PSI columns BEFORE adding 'compare_set'; the original
        # built columns_order from the augmented frame, duplicating the
        # 'compare_set' column in the returned DataFrame.
        if psi_columns is None:
            psi_columns = list(tmp.columns.values)
        tmp.loc[:, 'compare_set'] = '%s_vs_%s' % (train_set_name, label)
        d_list.append(tmp)

    # Nothing to compare: the original crashed here (NameError on `tmp` /
    # pd.concat of an empty list).
    if not d_list:
        return pd.DataFrame()

    columns_order = ['model_label', 'compare_set'] + psi_columns
    all_score_psi = pd.concat(d_list)
    all_score_psi.loc[:, 'model_label'] = model_label
    return all_score_psi[columns_order]
Beispiel #6
0
def generate_xgb_deployment_documents(model_label,
                                      eda_table,
                                      model_spec,
                                      xgb_importance_score,
                                      var_dict,
                                      model_decile,
                                      model_result,
                                      RESULT_PATH,
                                      test_case_data,
                                      test_nums,
                                      woe_iv_df_coarse,
                                      production_name_map={}):
    """
    Generate the XGBoost model deployment documents.

    Args:
    model_label (str): model name
    eda_table (DataFrame): EDA result
    model_spec (dict): contents of the "XGBoost输出数据和分箱明细.pkl" file
    xgb_importance_score (DataFrame): xgboost feature importance ranking
    var_dict (DataFrame): data dictionary
    model_decile (DataFrame): modeling decile table
    model_result (dict): model result; must contain 'model_final',
        'p_train' and 'p_test'
    RESULT_PATH (str): result path
    test_case_data (DataFrame): data used to generate test cases. For
        deployment make sure apply_id is the index.
    test_nums (int): number of test cases to generate
    woe_iv_df_coarse (DataFrame): coarse-binning result from modeling
    production_name_map (dict): used when a variable's online name differs
        from its offline modeling name. key = modeling-time name,
        value = online name. default={} means no naming differences.
    """
    if not os.path.exists(os.path.join(RESULT_PATH, 'deployment')):
        os.makedirs(os.path.join(RESULT_PATH, 'deployment'))

    DEPLOY_PATH = os.path.join(RESULT_PATH, 'deployment')

    rebin_spec = model_spec['rebin_spec']
    bin_to_label = model_spec['bin_to_label']
    dummy_var_name_map = model_spec['dummy_var_name_map']

    impt_writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s_variable_importance.xlsx' % model_label))
    xgb_importance_score.to_excel(impt_writer, 'local', index=False)
    impt_writer.save()

    xgb_importance_score = xgb_importance_score.rename(
        columns={
            'feature': 'XGB衍生入模名称',
            'fscore': '指标用于Split数据的数量',
            'imp_pct': '指标用于Split数据的数量占比'
        })

    # Recover the derivation type and the raw (intermediate-layer) variable
    # name from each derived xgb feature name.
    xgb_importance_score[['XGB变量转换类型', '中间层指标名称']] = xgb_importance_score['XGB衍生入模名称']\
                        .apply(lambda x: pd.Series(mt.BinWoe().xgboost_obtain_raw_variable(x, var_dict)))

    # Preserve the modeling-time names before applying production renames.
    xgb_importance_score['建模时指标名称'] = xgb_importance_score['中间层指标名称'].copy()
    xgb_importance_score['建模时XGB衍生入模名称'] = xgb_importance_score[
        'XGB衍生入模名称'].copy()
    for original, new_name in production_name_map.items():
        a = xgb_importance_score.loc[xgb_importance_score['建模时指标名称']==original, '建模时XGB衍生入模名称']\
                                .apply(lambda x: x.replace(original, new_name))
        xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] == original,
                                 'XGB衍生入模名称'] = a
        if original in rebin_spec:
            rebin_spec[new_name] = rebin_spec.pop(original)
        if original in bin_to_label:
            bin_to_label[new_name] = bin_to_label.pop(original)
        # Important: rename the variable inside var_dict too, otherwise
        # mu.process_rebin_spec(rebin_spec, var_dict, num_variables+bin_variables)
        # fails later.
        if original in list(var_dict['指标英文']):
            var_dict['指标英文'].replace({original: new_name}, inplace=True)

    xgb_importance_score['中间层指标名称'] = xgb_importance_score['中间层指标名称'].replace(
        production_name_map)

    # Attach data-source / Chinese-name / dtype info from the dictionary.
    xgb_importance_score = var_dict[['数据源', '指标英文', '指标中文', '数据类型']]\
                                .rename(columns={'指标英文':'中间层指标名称'})\
                                .merge(xgb_importance_score, on='中间层指标名称', how='right')

    xgb_importance_score.insert(
        5, '输出打分指标名称', xgb_importance_score['XGB衍生入模名称'].apply(
            lambda x: 'mlr_' + str(x) + '_xgb_' + model_label))

    # Add the aggregate score/probability output rows. pd.concat replaces
    # DataFrame.append, which is removed in recent pandas; behavior is the
    # same (rows appended, other columns NaN).
    xgb_importance_score = pd.concat(
        [xgb_importance_score,
         pd.DataFrame([
             {'输出打分指标名称': 'mlr_creditscore_xgb_' + model_label},
             {'输出打分指标名称': 'mlr_prob_xgb_' + model_label},
         ])],
        ignore_index=True)

    selected_variables = xgb_importance_score['建模时指标名称'].unique()
    model_eda = eda_table.loc[eda_table['指标英文'].isin(
        selected_variables)].copy()
    model_eda['指标英文'] = model_eda['指标英文'].replace(production_name_map)

    woe_iv_df_coarse_copy = woe_iv_df_coarse.copy()
    if len(production_name_map) > 0:
        for j, k in production_name_map.items():
            woe_iv_df_coarse_copy = woe_iv_df_coarse_copy.replace(j, k)

    xgb_importance_score.rename(columns={'数据源': '指标分类'}, inplace=True)
    # Join the importance ranking with the coarse-binning statistics.
    xgb_importance_score_bin_result = pd.merge(
        xgb_importance_score.drop_duplicates('中间层指标名称')[[
            '指标分类', '中间层指标名称', 'XGB衍生入模名称', '输出打分指标名称',
            '指标用于Split数据的数量', '指标用于Split数据的数量占比',
            'XGB变量转换类型', '建模时指标名称', '建模时XGB衍生入模名称']],
        woe_iv_df_coarse_copy,
        left_on='中间层指标名称',
        right_on='指标英文',
        how='left')[[
            '指标分类', '中间层指标名称', '指标中文', 'XGB衍生入模名称',
            '输出打分指标名称', '指标用于Split数据的数量',
            '指标用于Split数据的数量占比', 'XGB变量转换类型', '建模时指标名称',
            '建模时XGB衍生入模名称', '数据类型', '指标类型', '分箱',
            '分箱对应原始分类', 'N', '分布占比', 'WOE', '逾期率']]

    # Prefer the raw category labels over encoded bin labels where present.
    xgb_importance_score_bin_result[
        '分箱'] = xgb_importance_score_bin_result.apply(
            lambda x: x['分箱对应原始分类'] if x['分箱对应原始分类'] else x['分箱'], axis=1)
    xgb_importance_score_bin_result.drop(['分箱对应原始分类'], axis=1, inplace=True)

    writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))
    xgb_importance_score_bin_result.to_excel(writer,
                                             '2_模型变量重要性排序及分箱统计',
                                             index=False)
    model_decile.to_excel(writer, '3_模型decile', index=False)
    model_eda.to_excel(writer, '4_模型EDA', index=False)
    writer.save()
    logging.info("""第六步部署:生成XGBoost部署文档。
    1. 模型部署Excel文档存储于:%s
    2. 需添加『0_文档修订记录』、『1_信息总览』页面。详见其他正式部署文档文件。并存储于『/Seafile/模型共享/模型部署文档/』相应文件夹中
    """ % os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))

    # Rename the booster's feature names to the production derived names.
    derive_name_map = dict(
        zip(xgb_importance_score['建模时XGB衍生入模名称'],
            xgb_importance_score['XGB衍生入模名称']))
    xgbmodel = model_result['model_final']

    var_list = []
    for i in xgbmodel.__dict__['feature_names']:
        try:
            var_list.append(derive_name_map[i])
        except KeyError:
            # Not a derived feature -- keep the original name.
            var_list.append(i)
    xgbmodel.__dict__['feature_names'] = var_list

    if len(production_name_map) > 0:
        # BUGFIX: apply every rename cumulatively. The original rebound
        # `r = i.replace(j, k)` from the unmodified name on each iteration,
        # so only the LAST mapping in production_name_map took effect.
        renamed_features = []
        for name in var_list:
            for old, new in production_name_map.items():
                name = name.replace(old, new)
            renamed_features.append(name)
        xgbmodel.__dict__['feature_names'] = renamed_features

    num_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'num_vars_origin',
        '中间层指标名称'].unique())
    bin_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'bin_vars', '中间层指标名称'].unique())
    rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict,
                                            num_variables + bin_variables)
    bin_to_label = {
        k: v
        for k, v in bin_to_label.items() if k in bin_variables
    }

    # Record, per transformation type, which variables go through it.
    var_transform_method = {}
    var_transform_method['num_vars_origin'] = num_variables
    var_transform_method['bin_vars'] = bin_variables
    var_transform_method['dummy_vars'] = {}
    dummy_vars_df = xgb_importance_score.loc[xgb_importance_score['XGB变量转换类型']
                                             == 'dummy_vars'].copy()
    dummy_vars_df.loc[:, "dummy原始名称"] = dummy_vars_df['建模时XGB衍生入模名称'].apply(
        lambda x: dummy_var_name_map[x])
    dummy_vars_df.loc[:,
                      'dummy原始对应分类'] = dummy_vars_df.loc[:, "dummy原始名称"].apply(
                          lambda x: x.split('DUMMY')[-1])
    for original_variable in dummy_vars_df['中间层指标名称'].unique():
        cat_list = list(
            dummy_vars_df.loc[dummy_vars_df['中间层指标名称'] == original_variable,
                              'dummy原始对应分类'].unique())
        var_transform_method['dummy_vars'][original_variable] = cat_list

    # Dummy-name mapping with production variable names substituted in.
    production_dummy_var_name_map = {}
    for new_dummy_name, old_dummy_name in dummy_var_name_map.items():
        if new_dummy_name != old_dummy_name:
            if new_dummy_name.split('DUMMY')[0] in production_name_map:
                var_name = new_dummy_name.split('DUMMY')[0]
                prod_var_name = production_name_map[var_name]
                production_dummy_var_name_map[old_dummy_name.replace(
                    var_name, prod_var_name)] = new_dummy_name.replace(
                        var_name, prod_var_name)
            else:
                production_dummy_var_name_map[old_dummy_name] = new_dummy_name

    var_transform_method['dummy_var_name_map'] = production_dummy_var_name_map

    save_data_to_json(rebin_spec_json, DEPLOY_PATH,
                      '%s_selected_rebin_spec.json' % model_label)
    save_data_to_json(bin_to_label, DEPLOY_PATH,
                      '%s_bin_to_label.json' % model_label)
    save_data_to_python2_pickle(xgbmodel, DEPLOY_PATH,
                                '%s_xgbmodel.pkl' % model_label)
    save_data_to_json(var_transform_method, DEPLOY_PATH,
                      '%s_var_transform_method.json' % model_label)

    # Generate the model test cases.
    if test_nums < len(test_case_data):
        test_case_data_final = test_case_data[:test_nums]
    else:
        test_case_data_final = test_case_data.copy()
    if len(production_name_map) > 0:
        # BUGFIX: same cumulative-rename fix as for the feature names above.
        renamed_columns = []
        for col in test_case_data_final.columns:
            for old, new in production_name_map.items():
                col = col.replace(old, new)
            renamed_columns.append(col)
        test_case_data_final.columns = renamed_columns

    offline_model_score = pd.concat([model_result['p_train'].apply(mt.Performance()\
                                  .p_to_score).to_frame('offline_model_score'),\
                                    model_result['p_test'].apply(mt.Performance()\
                                  .p_to_score).to_frame('offline_model_score')])

    model_used_origin_features = [
        i for i in list(set(xgb_importance_score['中间层指标名称']))
        if str(i) != 'nan' and i is not None
    ]
    test_case_data_final_with_xgbScore = pd.merge(
        test_case_data_final[model_used_origin_features],
        offline_model_score,
        left_index=True,
        right_index=True)

    def get_xgb_testcase(test_data, modelName, productName):
        # Build the curl command that posts one test case to the local
        # model-invocation service.
        strs = ''' curl -d 'varparams={"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, productName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''}' -X POST http://localhost:8080/modelInvoke'''
        return final

    test_case_data_final_with_xgbScore[
        'test_case'] = test_case_data_final_with_xgbScore[
            test_case_data_final_with_xgbScore.columns.difference(
                ['offline_model_score'])].to_dict('records')
    test_case_data_final_with_xgbScore[
        'curl_script'] = test_case_data_final_with_xgbScore['test_case'].apply(
            get_xgb_testcase, args=(model_label, model_label))
    test_case_data_final_with_xgbScore[[
        'offline_model_score', 'curl_script'
    ]].to_csv(os.path.join(DEPLOY_PATH,
                           '%s_xgbmodel_testcase.csv' % model_label),
              index=None)
    # Generate test cases for the model deployment platform.
    generate_xgb_testcases(model_label, test_case_data_final, var_dict,
                           DEPLOY_PATH, model_result)

    # BUGFIX: the original message contained a broken '%save_data_dict_to_pickle'
    # directive (rendered as garbage) and listed '%sbin_to_label.json', which
    # does not match the saved '%s_bin_to_label.json' filename.
    logging.info("""第六步部署:生成XGBoost部署文档。
    线上部署配置文件存储于%s路径下
    1. %s
    2. %s
    3. %s
    4. %s
    """ % (
        DEPLOY_PATH,
        '%s_selected_rebin_spec.json' % model_label,
        '%s_bin_to_label.json' % model_label,
        '%s_var_transform_method.json' % model_label,
        '%s_xgbmodel.pkl' % model_label,
    ))
Beispiel #7
0
# Variables dropped by the correlation analysis and VIF filtering
exclusion_cols = vif_result.corr_exclude_vars + vif_result.vif_dropvars
exclusion_cols

# # Adjust the coarse binning: remove the excluded variables
selected = list(set(selected) - set(exclusion_cols))
selected

################################################################
# S9: bin adjustment for logistic regression; handling of derived
#     bin_num and dummy variables
################################################################

# ## Re-bin the training set with the new rebin_spec
# apply new bin cutting to the data
bin_obj = mt.BinWoe()
new_X = X_train[selected]
X_cat_train = bin_obj.convert_to_category(new_X, var_dict, rebin_spec)
woe_iv_df_coarse = bin_obj.calculate_woe_all(X_cat_train, y_train, var_dict,
                                             rebin_spec)
# Write the pre-encoding raw values of the categorical variables into the
# coarse-binning table
mt.add_origin_categorical_val(woe_iv_df_coarse, all_encoding_map)
# ## Persist woe_iv_df_coarse for deployment monitoring
save_data_to_pickle(woe_iv_df_coarse, Result_path, 'woe_iv_df_coarse.pkl')

# ## Re-bin the test set with the new rebin_spec
new_X = X_test[selected]
X_cat_test = bin_obj.convert_to_category(new_X, var_dict, rebin_spec)

# # Inspect the monthly distribution and overdue rate of the binned training set
# ## Merge the binned data with y and the time identifier from the raw data
def xgb_online_score_validity(data,input_vars,var_dict,var_transform_method,model_booster,model_decile,onLineScoreName):
    '''
    Post-launch validation for an xgb model: detects unfilled missing
    values, scoring mismatches between the online and offline scores, and
    computes the model score PSI.

    Args:

    data(DataFrame): must contain apply_id, var_code and created_at columns.
        Follow the deployment-monitoring conventions when writing the PSI
        SQL extraction; if the unique key is not apply_id, rename that
        column to apply_id.
    input_vars(list): raw input variables
    var_dict(DataFrame): data dictionary
    var_transform_method(json): variable transformation file from deployment
    model_booster(booster): booster file from model deployment
    model_decile(DataFrame): decile table from modeling time
    onLineScoreName: online score column name of the xgb model

    Returns:
    data_input_vars_missing(DataFrame): all applies with unfilled missing values
    data_scored_applys_wrong(DataFrame): detail of wrongly scored applies
    model_decile_summary(DataFrame): model score PSI
    '''
    # Keep only the latest record per (apply_id, var_code).
    data = data.sort_values(['apply_id','var_code','created_at'],ascending=False)\
    .drop_duplicates(['apply_id','var_code'])

    # Pivot long format (one row per variable) into wide format (one row per apply).
    data_pivot_all = data.pivot(index='apply_id',columns='var_code',values='var_val')

    # Split applies with any missing input variable from fully populated ones.
    data_input_vars_missing = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)>0]
    data_scored_applys = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)==0]

    # Raw data before entering the model.
    data_scored_applys_origin = data_scored_applys[input_vars]
    # Cast each column to the type declared in the data dictionary.
    data_scored_applys_origin = mu.convert_right_data_type(data_scored_applys_origin, var_dict)[0]

    try:
        # Dummy-encode the variables that require it.
        data_dummys = pd.get_dummies(data_scored_applys_origin[list(var_transform_method['dummy_vars'].keys())]\
                                 ,columns=list(var_transform_method['dummy_vars'].keys()), drop_first=False\
                                 , prefix_sep='DUMMY')
        # Data entering the model.
        data_in_model = pd.merge(data_scored_applys_origin,data_dummys,left_index=True,right_index=True)
    except:
        # Best-effort: no dummy variables configured -- score on the raw columns.
        data_in_model = data_scored_applys_origin
    data_in_model.rename(columns=var_transform_method['dummy_var_name_map'],inplace=True)

    # Convert to DMatrix in the exact feature order the booster expects.
    data_in_model_DMatrix = xgb.DMatrix(data_in_model[model_booster.__dict__['feature_names']])

    data_scored_applys['offLine_prob'] = model_booster.predict(data_in_model_DMatrix)
    data_scored_applys['offLineScore'] = data_scored_applys['offLine_prob']\
    .apply(lambda x : mt.Performance().p_to_score(x))

    # apply_ids whose offline score disagrees with the online score.
    data_scored_applys_wrong = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
    .apply(lambda x : float(x[onLineScoreName]) != float(x['offLineScore']),axis=1)]

    data_scored_applys_right = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
    .apply(lambda x : float(x[onLineScoreName]) == float(x['offLineScore']),axis=1)]

    # PSI between the modeling decile and the current score distribution.
    point_bounds_XGBRandom = mt.BinWoe().obtain_boundaries(model_decile['分箱'])['cut_boundaries']
    model_decile_curr = pd.cut(data_scored_applys_right[onLineScoreName].astype(float)\
       ,point_bounds_XGBRandom).value_counts().sort_index().to_frame('curr_num').reset_index()

    model_decile_final = pd.concat([model_decile,model_decile_curr],axis=1)
    model_decile_final['建模分布'] = model_decile_final['样本数']/model_decile_final['样本数'].sum()
    model_decile_final['curr_dist'] = model_decile_final['curr_num']/model_decile_final['curr_num'].sum()
    # Per-bin PSI term; 0 when either distribution has an empty bin.
    model_decile_final['psi'] = model_decile_final.apply(lambda x : (x['建模分布'] - x['curr_dist'])\
                           *log(x['建模分布']/x['curr_dist'])\
                           if x['curr_dist']!=0 and x['建模分布']!=0 else 0\
                           ,axis=1)
    model_decile_summary = model_decile_final[['分箱','样本数','建模分布','curr_num','curr_dist','psi']]
    return data_input_vars_missing,data_scored_applys_wrong,model_decile_summary
def verify_var_and_score_validity(data_origin,score_card_deploy,rebin_spec_bin_adjusted_deploy,model_decile_deploy):
    '''
    Post-launch validation for a scorecard model: detects unfilled missing
    values, scoring mismatches between the online and offline scores, and
    computes variable- and score-level PSI.

    Args:

    data_origin(DataFrame): must contain apply_id, var_code and created_at
        columns. Follow the deployment-monitoring conventions when writing
        the PSI SQL extraction; if the unique key is not apply_id, rename
        that column to apply_id.
    score_card_deploy(DataFrame): the scorecard AS DEPLOYED
    rebin_spec_bin_adjusted_deploy(dict): the deployed "变量分箱.json" file
    model_decile_deploy(DataFrame): the deployed model decile (part of the
        deployed scorecard)

    Returns:
    wrong_data(DataFrame): all applies with unfilled missing values
    wrong_score_vars(dict): per variable, the detail of wrong variable scores
    wrong_model_score(DataFrame): detail of wrong model scores
    var_dist_data_with_score_card(DataFrame): variable PSI
    score_dist_with_decile(DataFrame): model score PSI
    '''

    # Rename deployed scorecard columns back to the modeling-time names.
    score_card_deploy.rename(columns={'打分':'变量打分','中间层指标名称':'指标英文'},inplace=True)
    # Variables used in the model and their score-output column names.
    model_selected = [i for i in set(list(score_card_deploy['指标英文'])) if str(i)!='nan']
    mlr_selected_vars =  [i for i in set(list(score_card_deploy['输出打分指标名称'])) if str(i)!='nan']

    # Build a data dictionary from the scorecard.
    var_dict = score_card_deploy.drop_duplicates('指标英文')
    var_dict = var_dict[var_dict['指标英文'].isin(model_selected)]

    # Rebuild the modeling-time scorecard.
    # FIX: .copy() so the intercept-row assignments below stick (the original
    # mutated a slice of score_card_deploy, which is unreliable and warns);
    # .loc replaces the long-removed .ix indexer with the same label-based
    # semantics on this integer index.
    score_card_modeling = score_card_deploy[score_card_deploy['指标英文'].isin(model_selected)].copy()
    score_card_modeling.loc[len(score_card_modeling)+1,'指标英文'] = 'intercept'
    const_var = [i for i in set(list(score_card_deploy['输出打分指标名称'])) if 'const' in i][0]
    const_value = float(score_card_deploy.loc[score_card_deploy['输出打分指标名称']==const_var,'变量打分'])
    score_card_modeling.loc[score_card_modeling['指标英文']=='intercept','变量打分'] = const_value
    # Modeling-time rebin_spec: numerical specs merged with categorical specs.
    rebin_spec_bin_adjusted_modeling = rebin_spec_bin_adjusted_deploy['numerical']
    rebin_spec_bin_adjusted_modeling.update(rebin_spec_bin_adjusted_deploy['categorical'])

    # Map each model variable to its score-output column name.
    modelVar_scoreVar_dict = {}
    for j,i in score_card_deploy[['指标英文','输出打分指标名称']].drop_duplicates(['指标英文']).iterrows():
        if str(i['指标英文'])!='nan':
            modelVar_scoreVar_dict[i['指标英文']] = i['输出打分指标名称']

    # Deduplicate the raw data, keeping the latest record; fall back when
    # created_at is absent from the extraction.
    try:
        data_origin = data_origin.sort_values(['apply_id','var_code','created_at'],ascending=False)\
        .drop_duplicates(['apply_id','var_code','created_at'])
    except:
        try:
            data_origin = data_origin.sort_values(['apply_id','var_code'],ascending=False)\
            .drop_duplicates(['apply_id','var_code'])
        except:
            pass

    data_origin_pivot = data_origin.pivot(index='apply_id',columns='var_code',values='var_val')

    # Applies with at least one missing variable.
    wrong_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1)>0]
    wrong_data.reset_index(inplace=True)
    # Fully populated applies that can be scored.
    right_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1)==0]

    right_data_converted = convert_right_data_type(right_data[model_selected],var_dict)[0]

    # Convert the data into bins.
    bin_obj = mt.BinWoe()
    data_origin_pivot_cat = bin_obj.convert_to_category(right_data_converted[model_selected], var_dict\
                                               , rebin_spec_bin_adjusted_modeling)
    data_origin_pivot_score_all = mt.Performance().calculate_score_by_scrd(data_origin_pivot_cat[model_selected]\
                                                               ,score_card_modeling)
    data_origin_pivot_score_vars = data_origin_pivot_score_all[0]
    data_origin_pivot_score_model = data_origin_pivot_score_all[1]


    data_origin_pivot_cat.columns = [i+'_cat' for i in data_origin_pivot_cat.columns]
    data_origin_pivot_score_vars.columns = [i+'_varScoreBymodel' for i in data_origin_pivot_score_vars.columns]

    data_origin_pivot_all = pd.merge(pd.merge(data_origin_pivot,data_origin_pivot_cat,left_index=True,right_index=True)\
             ,data_origin_pivot_score_vars,left_index=True,right_index=True)

    # Collect the applies whose offline variable score differs from online.
    wrong_score_vars = {}
    for i in model_selected:
        wrong_dt =data_origin_pivot_all[data_origin_pivot_all\
                                        .apply(lambda x : float(x[modelVar_scoreVar_dict[i]])!=float(x[i+'_varScoreBymodel']),axis=1)]\
                                        [[i,i+'_cat',i+'_varScoreBymodel',modelVar_scoreVar_dict[i]]]

        wrong_score_vars[i] = wrong_dt.reset_index()


    # Combine the online and offline model scores.
    mlr_score_vars = [i for i in mlr_selected_vars if 'creditscore' in i][0]
    model_on_off_score = pd.merge(data_origin_pivot_score_model.to_frame('back_score'),data_origin_pivot_all[mlr_score_vars]\
         .to_frame('mlr_score'),left_index=True,right_index=True)

    wrong_model_score = model_on_off_score[model_on_off_score\
                                           .apply(lambda x : float(x['back_score']) != float(x['mlr_score']),axis=1)]
    wrong_model_score.reset_index(inplace=True)

    # Variable-level PSI.
    var_dist = []
    for i in model_selected:
        tmp = (data_origin_pivot_cat[i+'_cat'].value_counts())\
        .to_frame('allNum').reset_index()
        tmp['dist'] = tmp['allNum']/len(data_origin_pivot_cat)

        tmp['指标英文'] = i
        tmp.columns = ['分箱','allNum','dist','指标英文']
        var_dist.append(tmp)
    var_dist_data = pd.concat(var_dist)[['指标英文','分箱','allNum','dist']]
    var_dist_data['分箱'] = var_dist_data['分箱'].astype(str)
    var_dist_data_with_score_card = pd.merge(score_card_modeling,var_dist_data,on = ['指标英文','分箱'])
    # Per-bin PSI term; 0 when either distribution has an empty bin.
    var_dist_data_with_score_card['PSI'] = var_dist_data_with_score_card.apply(lambda x : (x['dist'] - x['分布占比'])\
                           *log(x['dist']/x['分布占比'])\
                           if x['dist']!=0 and x['分布占比']!=0 else 0\
                           ,axis=1)
    var_dist_data_with_score_card = var_dist_data_with_score_card\
    [['指标分类','指标英文','变量中文','数据类型','分箱','N','分布占比','allNum','dist','PSI']]
    # Score-level PSI: cut the online scores with the deployed decile boundaries.
    point_bounds = mt.BinWoe().obtain_boundaries(model_decile_deploy[u'分箱'])['cut_boundaries']

    mlr_score_dist = (pd.cut(data_origin_pivot_all[mlr_score_vars].astype(float),point_bounds)\
    .value_counts().to_frame('allNum')).reset_index()

    mlr_score_dist['dist'] = mlr_score_dist['allNum']/len(data_origin_pivot_all)
    mlr_score_dist['index'] = mlr_score_dist['index'].astype(str)
    model_decile_deploy['样本分布'] = model_decile_deploy['样本数']/sum(model_decile_deploy['样本数'])
    score_dist_with_decile = pd.merge(model_decile_deploy,mlr_score_dist,left_on='分箱',right_on='index')\
    [['分箱','样本数','样本分布','allNum','dist']]

    score_dist_with_decile['PSI'] = score_dist_with_decile.apply(lambda x : (x['dist'] - x['样本分布'])\
                           *log(x['dist']/x['样本分布'])\
                           if x['dist']!=0 and x['样本分布']!=0 else 0\
                           ,axis=1)

    return wrong_data,wrong_score_vars,wrong_model_score,var_dist_data_with_score_card,score_dist_with_decile
import os
import sys
import pickle
import time
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve

import utils3.misc_utils as mu
import utils3.summary_statistics1 as ss
import utils3.feature_selection as fs
fs_obj = fs.FeatureSelection()
import utils3.metrics as mt
pf = mt.Performance()
bw = mt.BinWoe()
import utils3.plot_tools as pt
import matplotlib.pyplot as plt
import utils3.xgboost_model as xm
import utils3.filing as fl
import utils3.data_io_utils as du


## 数据预处理、字典、eda、缺失填充
def data_preprocess(df, in_dict, var_dict=None):
    '''
    输入原始数据,输出缺失填充好的数据、字典、eda结果
    'known_missing': {-999:-9999, -1111:-9999}   #将data中已知表示缺失的值赋值为-9999
    '''
    #index_id=in_dict['index_id']
    #label=in_dict['target']
Beispiel #11
0
    def overall_ranking(self,
                        X,
                        y,
                        var_dict,
                        args_dict,
                        methods,
                        num_max_bins=10,
                        n_clusters=15,
                        verbose=True):
        """
        Aggregate variable-importance ranking across several algorithms.

        Runs automatic binning + WOE/IV first (mandatory), then each method
        listed in `methods`, merges the per-method rank columns and averages
        them into an overall rank, and finally attaches a variable-cluster
        assignment.

        Args:
        X (pd.DataFrame): wide table of x variables, ideally with missing
            values already handled; if any remain, they are processed here.
        y (pd.Series): y label.
        var_dict (pd.DataFrame): variable dictionary. Must contain the
            columns [u'数据源', u'指标英文', u'指标中文', u'数据类型', u'指标类型'].
        args_dict (dict): keys are algorithm names such as
            ['random_forest', 'svm', 'xgboost', 'IV', 'lasso']; each value is
            a dict that may hold 'grid_search' and 'param'. Defaults are used
            when a key is absent.
        methods (list): subset of ['random_forest', 'svm', 'xgboost', 'IV', 'lasso'].
        num_max_bins (int): bin count used by automatic binning, default=10.
        n_clusters (int): cluster count for variable clustering, default=15.
        verbose (bool): verbosity flag passed through to the helper calls.

        Returns:
        X_cat (pd.DataFrame): auto-binned (categorical) version of X.
        X_transformed (pd.DataFrame): WOE-transformed X from variable clustering.
        woe_iv_df (pd.DataFrame): auto-classing WOE/IV table, suitable for
            pasting directly into the output document.
        rebin_spec (dict): binning spec rebuilt from the WOE table and
            encoding maps.
        result (pd.DataFrame): per-variable metrics/ranks for each method,
            the combined 'overall_rank', and the cluster assignment.
        """
        num_missing = pd.isnull(X).sum().sum()
        if num_missing > 0:
            X = mu.process_missing(X, var_dict, verbose=verbose)

        result_df_list = []
        # IV is mandatory. calculate_iv bins internally, so the input data
        # must be raw — only missing-value processing applied, no encoding.
        bin_obj = mt.BinWoe()
        logging.info('Overall ranking prep: auto binning started')
        X_cat, all_encoding_map, all_spec_dict = bin_obj.binning(
            X, y, var_dict, num_max_bins, verbose=verbose)
        logging.info('Overall ranking prep: auto binning completed')
        woe_iv_df = bin_obj.calculate_woe_all(X_cat,
                                              y,
                                              var_dict,
                                              all_spec_dict,
                                              verbose=verbose)
        logging.info('Overall ranking prep: woe calculation completed')
        rebin_spec = bin_obj.create_rebin_spec(woe_iv_df, all_spec_dict,
                                               all_encoding_map)
        # Higher IV gets a better (smaller) rank number.
        iv_result = woe_iv_df[['指标英文', 'IV']].copy()\
                            .rename(columns={'指标英文':'var_code'})\
                            .drop_duplicates().dropna()
        iv_result.loc[:, 'iv_rank'] = iv_result.IV.rank(ascending=False)
        result_df_list.append(iv_result)
        logging.info('Overall ranking prep: IV completed')

        # Label-encode categorical variables; numerical columns stay as-is.
        def _encoding_categorical(data, var_dict):
            new_data = data.copy()
            if '数据类型' in var_dict.columns.values:
                cateogory_col = var_dict.loc[var_dict['指标英文'].isin(new_data.columns) &\
                                                  (var_dict['数据类型']=='varchar'), '指标英文'].tolist()
            else:
                # No dtype column in the dictionary: fall back to pandas dtypes.
                cateogory_col = [
                    col for col in new_data.columns
                    if new_data[col].dtype == 'object'
                ]

            for col in cateogory_col:
                # If NA is not filled as 'missing' beforehand, each NA would
                # receive its own label.
                new_data.loc[:, col] = mu.label_encode(data[col])
            return new_data

        X = _encoding_categorical(X, var_dict)

        if 'random_forest' in methods:
            random_forest_result = self.random_forest(X, y, \
                                            args_dict['random_forest'].get('grid_search', False), \
                                            args_dict['random_forest'].get('param', None))
            result_df_list.append(random_forest_result)
            logging.info('Overall ranking prep: Random Forest completed')

        # Slow: SVC is already backed by libsvm, so it cannot go any faster.
        if 'svm' in methods:
            svm_result = self.svm(X, y)
            result_df_list.append(svm_result)
            logging.info('Overall ranking prep: SVM completed')

        if 'xgboost' in methods:
            xgboost_result = self.xgboost(X, y, \
                                    args_dict['xgboost'].get('grid_search', False), \
                                    args_dict['xgboost'].get('param', None))
            result_df_list.append(xgboost_result)
            logging.info('Overall ranking prep: Xgboost completed')

        if 'lasso' in methods:
            lasso_result = self.lasso(X, y)
            result_df_list.append(lasso_result)
            logging.info('Overall ranking prep: LASSO completed')

        # Outer-merge every per-method frame on var_code so a variable missing
        # from one method still keeps its rankings from the others.
        # NOTE(review): relies on `reduce` being in scope at module level
        # (functools.reduce on Python 3) — confirm the file's imports.
        result = reduce(lambda left, right: left.merge(right, on='var_code', how='outer'),\
                        result_df_list)

        # Overall rank = row-wise mean of every column whose name contains
        # 'rank' (pandas mean skips NaNs from methods that lacked the variable).
        rank_columns = [i for i in result.columns if 'rank' in i]
        result.loc[:, 'overall_rank'] = (result[rank_columns] * 1.0).mean(1)
        # Attach Chinese name / data source from the dictionary (inner merge:
        # variables absent from var_dict are dropped here).
        result = var_dict.loc[:, ['指标英文', '指标中文','数据源']].drop_duplicates()\
                     .merge(result.rename(columns={'var_code': '指标英文'}), \
                            on='指标英文')

        cluster_result, X_transformed = self.variable_clustering(
            X_cat, woe_iv_df, n_clusters=n_clusters)
        result = result.merge(cluster_result, on='指标英文', how='left')

        return X_cat, X_transformed, woe_iv_df, rebin_spec, result