# X_bin_data = pd.concat([X_bin_data,bin_res], axis = 1)
#==============================================================================
# Build the scores: apply the dict_code replacement to the binned data, then
# add a row-wise total over all resulting columns.
score_data = bin_res_data.replace(dict_code)
score_data["score_sum"] = score_data.sum(axis=1)

# Attach the y column next to the scores.
scorcarde_data = pd.concat([score_data, loan_best_banning['y']], axis=1)

# filter_iv returns an indexable result; element 1 is kept as score_group and
# exported to Excel.
iv_score_sum = step01_feature_engine.filter_iv(scorcarde_data, group=10)
score_group = iv_score_sum[1]
score_group.to_excel(
    r"F:\TS\offline_model\02_DataProcess\05_result_score\model_result6_Jan.xlsx"
)

# Plot the total score to see how it is distributed.
step06_draw_plot.drawHistogram(scorcarde_data['score_sum'])
v_feat = ['score_sum']
step02_modle_plot.prob_density(scorcarde_data, v_feat)

# Scorecard: one kernel-density curve of score_sum per value of y.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha parameter
#plt.subplot2grid((2,3),(1,0), colspan=2)
for _y_value in (0, 1):
    scorcarde_data.score_sum[scorcarde_data.y == _y_value].plot(kind='kde')
plt.xlabel(u"score_sum")  # x-axis label
plt.ylabel(u"density")
plt.title(u"Distribution of score_sum")
# Legend entries follow drawing order: y == 0 first ('good'), then y == 1 ('bad').
plt.legend((u'good', u'bad'), loc='best')
# A KS value > 0.2 is taken to mean the model has fairly good predictive accuracy.
1.0 104 7.5% '''
# Result of the project's null-filling helper, kept in `a`.
# NOTE(review): whether fill_null_data also modifies `lf` in place cannot be
# told from here - confirm against the helper.
a = step01_feature_engine.fill_null_data(lf)
#==============================================================================
# Plotting
# Bar chart for every object-dtype column of lf.
objectColumns = lf.select_dtypes(include=["object"]).columns
# NOTE(review): lf[objectColumns].columns looks like it yields the same labels
# as objectColumns itself - confirm before simplifying.
var = lf[objectColumns].columns
for i in var:
    step06_draw_plot.drawBar(lf[i])
# Histogram for every float-dtype column of lf (objectColumns and var are
# rebound here to the float columns).
objectColumns = lf.select_dtypes(include=["float"]).columns
var = lf[objectColumns].columns
for i in var:
    step06_draw_plot.drawHistogram(lf[i])
# Homogeneity (single dominant value) check.
# NOTE(review): presumably filters features whose most frequent value exceeds
# ratiolimit - confirm against select_primaryvalue_ratio.
lf2, feature_primaryvalue_ratio = step01_feature_engine.select_primaryvalue_ratio(
    lf, ratiolimit=0.931)
# Print the string-typed variables.
step01_feature_engine.check_feature_binary(lf2)
# Inspect the distribution of each discrete value.
step01_feature_engine.watch_obj(lf2)
# Build the mapping used to convert ordinal variables.
mapping_dict1 = {
    "var1": {
        "无": 0,
'薪资发放方式', '子女个数', '贷款申请期限', 'group_level', 'risk_level', ] for i in var: step06_draw_plot.drawBar(df4[i]) v_feat = [ '申请贷款金额', '工作年限', '年龄', '月收入', '月其他收入', '社保公积基数', 'score', '贷款月还', '信用卡月还', '准贷记卡月还', '其他负债', '简版汇总负债总计', '征信负债率', '信用卡使用率', '计算年收入', '计算负债率1', '计算负债率2' ] for i in v_feat: step06_draw_plot.drawHistogram(df4[i]) def prob_density(data, v_feat): font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=10) plt.figure(figsize=(16, 20 * 4)) gs = gridspec.GridSpec(20, 1) for i, cn in enumerate(data[v_feat]): ax = plt.subplot(gs[i]) sns.distplot(data[cn][data["y"] == 1], bins=50) sns.distplot(data[cn][data["y"] == 0], bins=100) ax.set_xlabel('') ax.set_title('histogram of feature: ' + str(cn), fontproperties=font)