a = step01_feature_engine.fill_null_data(lf)
#==============================================================================
# Plotting
# Bar chart for every object-dtype column. The column index returned by
# select_dtypes is iterated directly; re-indexing `lf` with it only to read
# `.columns` back would copy the sub-frame for no benefit.
objectColumns = lf.select_dtypes(include=["object"]).columns
for i in objectColumns:
    step06_draw_plot.drawBar(lf[i])

# Histogram for every float-dtype column. A separate name is used so that
# `objectColumns` keeps referring to the object columns.
floatColumns = lf.select_dtypes(include=["float"]).columns
for i in floatColumns:
    step06_draw_plot.drawHistogram(lf[i])

# Same-value (homogeneity) check.
# NOTE(review): presumably filters out columns whose dominant value ratio
# exceeds `ratiolimit` -- confirm against step01_feature_engine.
lf2, feature_primaryvalue_ratio = (
    step01_feature_engine.select_primaryvalue_ratio(lf, ratiolimit=0.931)
)

# Print the string-typed variables.
step01_feature_engine.check_feature_binary(lf2)

# Inspect the distribution of each discrete value.
step01_feature_engine.watch_obj(lf2)

# Build the mapping used to convert ordinal variables
mapping_dict1 = {
    "var1": {
        "无": 0,
        "近151-180天": 1,
        "近121-150天": 1,
        "近91-120天": 1,
        "近61-90天": 1,
Example #2
0
    'same_com_lo_qurry_num_3m', 'same_com_lo_card_num_3m',
    'same_com_insur_qurry_num_3m', 'self_loan_dv_in1m', 'self_loan_dv_in3m',
    'self_loan_dv_in6m', 'self_loan_dv_in12m', 'self_loan_dv_in24m',
    'self_card_query_in6m', 'self_card_query_in3m', 'self_card_query_in1m',
    'self_card_query_in12m', 'self_card_query_in24m', 'self_loan_query_in6m',
    'self_loan_query_in3m', 'self_loan_query_in1m', 'self_loan_query_in12m',
    'self_loan_query_in24m', 'self_loan_query_de_f_in6m',
    'self_loan_query_de_f_in3m', 'self_loan_query_de_f_in1m',
    'self_loan_query_de_f_in12m', 'self_loan_query_de_f_in24m',
    'self_loan_card_query_in6m', 'self_loan_card_query_in3m',
    'self_loan_card_query_in1m', 'self_loan_card_query_in12m',
    'self_loan_card_query_in24m', 'y'
]]

# Same-value (homogeneity) check.
# NOTE(review): presumably filters out columns whose dominant value ratio
# exceeds `ratiolimit` -- confirm against step01_feature_engine.
df2, feature_primaryvalue_ratio = (
    step01_feature_engine.select_primaryvalue_ratio(df1, ratiolimit=0.95)
)

# Inspect missing values; the helper returns a frame and a null-ratio result.
df3, null_ratio = step01_feature_engine.select_null_ratio(df2)

# Draw a histogram for every column of df2. The series is taken from df2
# itself so the plotted data is the same frame whose columns are iterated.
var = list(df2.columns)
for i in var:
    step06_draw_plot.drawHistogram(df2[i])

# Inspect missing values.
#step01_feature_engine.fill_null_data(df3)
# Per-column null counts, largest first. This is a bare expression, so the
# result is only displayed in an interactive session.
df3.isnull().sum(axis=0).sort_values(ascending=False)
# select_null_ratio returns two items (see the earlier call that unpacks a
# frame and a ratio); unpack here too so `null_ratio` keeps the same meaning
# instead of being rebound to the whole return value.
_filtered, null_ratio = step01_feature_engine.select_null_ratio(df3)

# Replace every remaining missing value with 0, then re-check the null counts
# (again only displayed in an interactive session).
df4 = df3.fillna(0)
df4.isnull().sum(axis=0).sort_values(ascending=False)