# Feature screening on `lf3`: an IV filter followed by a Pearson-correlation
# review. `lf3` and the step01/step02 helper modules are defined outside this
# snippet.
var = ["var1", "var2", "var14", "var15"]
# Project helper applied to the four columns above; `str_value` is not read
# again in this snippet.
str_value = step01_feature_engine.str_ratio(lf3, var)

# Author's note: keep variables with IV above 0.02 (170 variables -> 126 kept).
# NOTE(review): `group=10` is presumably the bin count used for IV -- confirm
# in filter_iv.
new_data, iv_value = step01_feature_engine.filter_iv(lf3, group=10)

# Author's note: columns are meant to be in IV order so that fillter_pearson
# can drop the lower-IV member of a highly correlated pair. Nothing is sorted
# here, so this relies on `iv_value` already being IV-ordered -- TODO confirm.
# Select the var_name of rows with ori_IV >= 0.02 (inclusive), de-duplicate,
# and use the resulting names to pick columns from `new_data`.
list_value = iv_value[iv_value.ori_IV >= 0.02].var_name
iv_sort_columns = list(list_value.drop_duplicates())
lf4 = new_data[iv_sort_columns]

# Write the IV table to a hard-coded Windows path.
# NOTE(review): assuming `iv_value` is a pandas DataFrame, the legacy .xls
# target needs the xlwt writer, which pandas 2.x no longer supports -- confirm
# the pinned pandas version.
iv_value.to_excel(r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value.xls")

# Plot Pearson coefficients to look for multicollinear variables; the return
# value is fed to fillter_pearson below.
pearson_coef = step02_modle_plot.plot_pearson(lf4)

# Author's note: multivariate analysis -- keep variables whose correlation is
# below the 0.6 threshold; entries of the correlation matrix are compared and
# the variable with the smaller IV is dropped. (Describes the intended
# behaviour of fillter_pearson, which is not visible here.)
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)  # author's run: 136 variables in, 37 kept (the 136 does not match the 126 quoted above)
# Hard-coded selection of 37 feature columns plus 'y' (presumably the target
# label), taken from `new_data` rather than from `per_col`.
# NOTE(review): looks like the list was pasted from the printed `per_col` --
# it will not follow a re-run automatically.
lf5 = new_data[[
    'var8', 'var94', 'var17', 'var7', 'var13', 'var139', 'var78', 'var53',
    'var121', 'var97', 'var147', 'var113', 'var59', 'var57', 'var27', 'var114',
    'var26', 'var144', 'var154', 'var2', 'var141', 'var136', 'var65', 'var135',
    'var123', 'var107', 'var108', 'var122', 'var40', 'var118', 'var133',
    'var89', 'var19', 'var14', 'var134', 'var145', 'var156', 'y'
]]

pearson_coef = step02_modle_plot.plot_pearson(lf5)  # re-check collinearity on the reduced set (overwrites pearson_coef)
Exemple #2
0
# Inspect missing values in `df3` (defined outside this snippet).
# (disabled call) step01_feature_engine.fill_null_data(df3)
# Null count per column, largest first -- assuming `df3` is a pandas DataFrame.
# The result is not assigned, so it is only visible in an interactive session.
df3.isnull().sum(axis=0).sort_values(ascending=False)
# Project helper; `null_ratio` is not read again in this snippet.
null_ratio = step01_feature_engine.select_null_ratio(df3)

# Replace every missing value with 0, then repeat the null count on the result
# (again discarded unless run interactively).
df4 = df3.fillna(0)
df4.isnull().sum(axis=0).sort_values(ascending=False)

# Author's note: keep variables with IV above 0.02.
# NOTE(review): `group=5` is presumably the bin count used for IV -- confirm
# in filter_iv.
new_data, iv_value = step01_feature_engine.filter_iv(df4, group=5)
# Write the IV table to a hard-coded Windows path.
# NOTE(review): assuming `iv_value` is a pandas DataFrame, the legacy .xls
# target needs the xlwt writer, which pandas 2.x no longer supports -- confirm
# the pinned pandas version.
iv_value.to_excel(
    r"F:\TS\offline_model\01_Dataset\04_Output\credit\iv_value_credit.xls")

# Author's note: use Pearson coefficients to remove multicollinear variables.
# Here the whole of `new_data` is passed to the plotting helper.
pearson_coef = step02_modle_plot.plot_pearson(new_data)

df5 = new_data.drop([
    'selfquery_in12m', 'card_query_in12m', 'query_in3m_1', 'selfquery6_in12m',
    'use_credit_card_numb', 'selfquery6_in24m', 'self_loan_query_de_f_in24m',
    'self_loan_dv_in12m', 'mana_loan_in12m_de_f', 'query_in12m',
    'mana_loan_in24m', 'lo_query_in12m', 'lo_query_in6m', 'lo_query_in6m_de_f',
    'self_loan_card_query_in12m', 'lo_query_in24m_de_f', 'mana_loan_in1m',
    'query_in1m', 'query_in6m', 'mana_loan_in3m', 'mana_loan_in12m',
    'self_loan_query_de_f_in1m', 'self_loan_query_in3m',
    'self_loan_query_in6m', 'normal_card_num', 'self_loan_card_query_in12m',
    'self_loan_card_query_in6m', 'lo_query_in12m_de_f', 'selfquery_in24m',
    'self_loan_query_de_f_in12m', 'max_card_line', 'self_card_query_in24m',
    'selfquery6_in6m', 'selfquery6_in3m', 'self_card_query_in1m',
    'self_loan_card_query_in3m', 'card_query_in3m', 'self_loan_query_in1m',
    'max_loanline', 'card_query_in1m_max', 'self_card_query_in12m',
Exemple #3
0
#==============================================================================

# Author's note: keep variables with IV above 0.02 (170 variables -> 126 kept).
# `df4` and the step01/step02 helper modules are defined outside this snippet.
# NOTE(review): `group=10` is presumably the bin count used for IV -- confirm
# in filter_iv.
new_data, iv_value = step01_feature_engine.filter_iv(df4, group=10)

# Author's note: columns are meant to be in IV order so that fillter_pearson
# can drop the lower-IV member of a highly correlated pair. Nothing is sorted
# here, so this relies on `iv_value` already being IV-ordered -- TODO confirm.
# Select the var_name of rows with ori_IV >= 0.02 (inclusive), de-duplicate,
# and use the resulting names to pick columns from `new_data`.
list_value = iv_value[iv_value.ori_IV >= 0.02].var_name
iv_sort_columns = list(list_value.drop_duplicates())
df5 = new_data[iv_sort_columns]

# Write the IV table to a hard-coded Windows path.
# NOTE(review): assuming `iv_value` is a pandas DataFrame, the legacy .xls
# target needs the xlwt writer, which pandas 2.x no longer supports -- confirm
# the pinned pandas version.
iv_value.to_excel(
    r"F:\TS\offline_model\01_Dataset\04_Output\lycredit\iv_value_lycredit_group10.xls"
)

# Plot Pearson coefficients to look for multicollinear variables; the return
# value is fed to fillter_pearson below.
pearson_coef = step02_modle_plot.plot_pearson(df5)

# Author's note: multivariate analysis -- keep variables whose correlation is
# below the 0.6 threshold; entries of the correlation matrix are compared and
# the variable with the smaller IV is dropped. (Describes the intended
# behaviour of fillter_pearson, which is not visible here.)
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)  # author's run: 126 variables in, 21 kept
# Hard-coded selection of 21 feature columns plus 'y' (presumably the target
# label), taken from `new_data` rather than from `per_col`.
# NOTE(review): looks like the list was pasted from the printed `per_col` --
# it will not follow a re-run automatically.
df6 = new_data[[
    'selfquery_cardquery_in3m', 'card_cardquery_rate', 'mean_cardline',
    'normal_card_num', 'selfquery_in3m_min_interval', 'max_loanline',
    'sum_carloan_line', 'near_newopen_carloan', 'can_card_rate',
    'far_open_loan', 'near_open_loan', 'inac_card_rate', 'od_card_rate',
    'min_cardline', 'clear_loan_num', 'min_cardline_f', 'near_open_percosloan',
    'bus_loan_num', 'manaquery_in6m_f', 'com_insurquery_max', 'inac_card_num',
    'y'
]]
Exemple #4
0
# for i in var:
#     step06_draw_plot.drawHistogram(df4[i])
#==============================================================================

# Author's note: keep variables with IV above 0.02 (170 variables -> 126 kept).
# `df2` and the step01/step02 helper modules are defined outside this snippet.
# NOTE(review): `group=10` is presumably the bin count used for IV -- confirm
# in filter_iv.
new_data,iv_value = step01_feature_engine.filter_iv(df2, group=10)

# Author's note: columns are meant to be in IV order so that fillter_pearson
# can drop the lower-IV member of a highly correlated pair. Nothing is sorted
# here, so this relies on `iv_value` already being IV-ordered -- TODO confirm.
# Select the var_name of rows with ori_IV >= 0.02 (inclusive), de-duplicate,
# and use the resulting names to pick columns from `new_data`.
list_value = iv_value[iv_value.ori_IV >= 0.02].var_name
iv_sort_columns = list(list_value.drop_duplicates())
df3 = new_data[iv_sort_columns]

# Write the IV table to a hard-coded Windows path.
# NOTE(review): assuming `iv_value` is a pandas DataFrame, the legacy .xls
# target needs the xlwt writer, which pandas 2.x no longer supports -- confirm
# the pinned pandas version.
iv_value.to_excel(r"F:\TS\offline_model\01_Dataset\04_Output\01_refresh\iv_value_refresh01.xls")

# Plot Pearson coefficients to look for multicollinear variables; the return
# value is fed to fillter_pearson below.
pearson_coef = step02_modle_plot.plot_pearson(df3)

# Author's note: multivariate analysis -- keep variables whose correlation is
# below the 0.6 threshold; entries of the correlation matrix are compared and
# the variable with the smaller IV is dropped. (Describes the intended
# behaviour of fillter_pearson, which is not visible here.)
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold = 0.60)
print ('保留了变量有:',len(per_col))
print (per_col)   # author's run: 126 variables in, 33 kept
df4 = new_data[['selfquery_cardquery_in6m','selfquery_loquery_in3m', 'card_num_fo',
       'selfquery_loquery_cardquery_in1m', 'selfquery_in3m_min_interval',
       'max_cardline', 'max_loanline', 'manaquery_in24m_def',
       'unclear_monthpay', 'min_cardline', 'near_open_loan',
       'max_carloan_line', 'min_cardline_f', 'near_newopen_carloan',
       'pettyloan_loquery_in6m', 'cardquery_in24m', 'near_house_loan',
       'inac_card_rate', 'card_cardquery_rate', 'cardquery_in6m_max',
       'selfquery5_in12m', 'card_num', 'selfquery6_in1m', 'manaquery_in1m_def',
       'can_card_num', 'cardquery_in3m', 'manaquery_in24m_f', 'clear_loan_num',
# Add column 'y' (values taken from `last`) to `df6` at position 152, in place.
# `df6` and `last` are defined outside this snippet.
# NOTE(review): assuming `df6` is a pandas DataFrame, the hard-coded position
# requires at least 152 existing columns and no existing 'y' column -- confirm
# against the upstream data.
df6.insert(152, 'y', last)

#============================ Variable selection: IV values and correlation coefficients ============================

# Author's note: keep variables with IV above 0.02 (156 variables -> 119 kept).
# NOTE(review): `group=10` is presumably the bin count used for IV -- confirm
# in filter_iv.
new_data,iv_value = step01_feature_engine.filter_iv(df6, group=10)

# Author's note: columns are meant to be in IV order so that fillter_pearson
# can drop the lower-IV member of a highly correlated pair. Nothing is sorted
# here, so this relies on `iv_value` already being IV-ordered -- TODO confirm.
# Select the var_name of rows with ori_IV >= 0.02 (inclusive), de-duplicate,
# and use the resulting names to pick columns from `new_data`.
list_value = iv_value[iv_value.ori_IV >= 0.02].var_name
iv_sort_columns = list(list_value.drop_duplicates())
df7 = new_data[iv_sort_columns]

# Write the IV table to a hard-coded Windows path.
# NOTE(review): assuming `iv_value` is a pandas DataFrame, the legacy .xls
# target needs the xlwt writer, which pandas 2.x no longer supports -- confirm
# the pinned pandas version.
iv_value.to_excel(r"F:\TS\offline_model\output\iv_value_group01.xls")

# Plot Pearson coefficients to look for multicollinear variables; the return
# value is fed to fillter_pearson below.
pearson_coef = step02_modle_plot.plot_pearson(df7)

# Author's note: multivariate analysis -- keep variables whose correlation is
# below the threshold; entries of the correlation matrix are compared and the
# variable with the smaller IV is dropped. (Describes the intended behaviour
# of fillter_pearson, which is not visible here.)
# The threshold passed here is 0.50; the original comment still said 0.6.
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold = 0.50)
print ('保留了变量有:',len(per_col))
print (per_col)   # author's run: 135 variables in, 49 kept (the 135 does not match the 119 quoted above)
df8 = df6[[                 
        'selfquery_cardquery_in6m', 'group_level', 'housing_nature_g',
       'local_nolocal_g', 'selfquery_loquery_cardquery_in1m',
       'social_fund_basenum', 'pettyloan_loquery_in6m',
       'selfquery_in3m_min_interval', 'asset_g', 'desired_loan_amt_g',
       'max_cardline_f', 'use_card_num', 'card_cardquery_rate', 'apply_city_g',
       'near_open_loan', 'month_other_income_g', 'company_nature_g',
       'max_loanline', 'far_open_loan', 'near_newopen_carloan', 'loan_num',
       'min_cardline', 'sex_g', 'card_num', 'other_debet',