# Run str_ratio on a small set of variables (the helper's semantics live in
# step01_feature_engine).
var = ["var1", "var2", "var14", "var15"]
str_value = step01_feature_engine.str_ratio(lf3, var)

# IV screening: keep variables whose ori_IV is at least 0.02
# (author's note: 126 of 170 variables were kept).
new_data, iv_value = step01_feature_engine.filter_iv(lf3, group=10)

# Take the surviving variable names in the row order of iv_value, so that
# fillter_pearson can drop the lower-IV member of a highly correlated pair.
# NOTE(review): this relies on filter_iv returning iv_value ordered by IV - confirm.
list_value = iv_value[iv_value["ori_IV"] >= 0.02]["var_name"]
iv_sort_columns = list(list_value.drop_duplicates())
lf4 = new_data[iv_sort_columns]
iv_value.to_excel(
    r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value.xls"
)

# Plot the Pearson coefficients to inspect multicollinear variables.
pearson_coef = step02_modle_plot.plot_pearson(lf4)

# Multivariate analysis: keep variables whose correlation is below the 0.6
# threshold; within the correlation matrix, the variable with the smaller IV
# is the one removed.
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)

# Author's note: 136 variables, 37 kept. The retained columns (plus the
# target 'y') are listed explicitly here.
lf5 = new_data[[
    'var8', 'var94', 'var17', 'var7', 'var13', 'var139',
    'var78', 'var53', 'var121', 'var97', 'var147', 'var113',
    'var59', 'var57', 'var27', 'var114', 'var26', 'var144',
    'var154', 'var2', 'var141', 'var136', 'var65', 'var135',
    'var123', 'var107', 'var108', 'var122', 'var40', 'var118',
    'var133', 'var89', 'var19', 'var14', 'var134', 'var145',
    'var156', 'y',
]]

# Check the collinearity picture again on the reduced frame.
pearson_coef = step02_modle_plot.plot_pearson(lf5)
#查看缺失值情况 #step01_feature_engine.fill_null_data(df3) df3.isnull().sum(axis=0).sort_values(ascending=False) null_ratio = step01_feature_engine.select_null_ratio(df3) df4 = df3.fillna(0) df4.isnull().sum(axis=0).sort_values(ascending=False) #IV保留大于0.02的变量 new_data, iv_value = step01_feature_engine.filter_iv(df4, group=5) iv_value.to_excel( r"F:\TS\offline_model\01_Dataset\04_Output\credit\iv_value_credit.xls") ##皮尔森系数删除多重共线的变量 pearson_coef = step02_modle_plot.plot_pearson(new_data) df5 = new_data.drop([ 'selfquery_in12m', 'card_query_in12m', 'query_in3m_1', 'selfquery6_in12m', 'use_credit_card_numb', 'selfquery6_in24m', 'self_loan_query_de_f_in24m', 'self_loan_dv_in12m', 'mana_loan_in12m_de_f', 'query_in12m', 'mana_loan_in24m', 'lo_query_in12m', 'lo_query_in6m', 'lo_query_in6m_de_f', 'self_loan_card_query_in12m', 'lo_query_in24m_de_f', 'mana_loan_in1m', 'query_in1m', 'query_in6m', 'mana_loan_in3m', 'mana_loan_in12m', 'self_loan_query_de_f_in1m', 'self_loan_query_in3m', 'self_loan_query_in6m', 'normal_card_num', 'self_loan_card_query_in12m', 'self_loan_card_query_in6m', 'lo_query_in12m_de_f', 'selfquery_in24m', 'self_loan_query_de_f_in12m', 'max_card_line', 'self_card_query_in24m', 'selfquery6_in6m', 'selfquery6_in3m', 'self_card_query_in1m', 'self_loan_card_query_in3m', 'card_query_in3m', 'self_loan_query_in1m', 'max_loanline', 'card_query_in1m_max', 'self_card_query_in12m',
#==============================================================================
# IV screening: keep variables whose ori_IV is at least 0.02
# (author's note: 126 of 170 variables were kept).
new_data, iv_value = step01_feature_engine.filter_iv(df4, group=10)

# Take the surviving variable names in the row order of iv_value, so that
# fillter_pearson can drop the lower-IV member of a highly correlated pair.
# NOTE(review): this relies on filter_iv returning iv_value ordered by IV - confirm.
list_value = iv_value[iv_value["ori_IV"] >= 0.02]["var_name"]
iv_sort_columns = list(list_value.drop_duplicates())
df5 = new_data[iv_sort_columns]
iv_value.to_excel(
    r"F:\TS\offline_model\01_Dataset\04_Output\lycredit\iv_value_lycredit_group10.xls"
)

# Plot the Pearson coefficients to inspect multicollinear variables.
pearson_coef = step02_modle_plot.plot_pearson(df5)

# Multivariate analysis: keep variables whose correlation is below the 0.6
# threshold; within the correlation matrix, the variable with the smaller IV
# is the one removed.
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)

# Author's note: 126 variables, 21 kept. The retained columns (plus the
# target 'y') are listed explicitly here.
df6 = new_data[[
    'selfquery_cardquery_in3m',
    'card_cardquery_rate',
    'mean_cardline',
    'normal_card_num',
    'selfquery_in3m_min_interval',
    'max_loanline',
    'sum_carloan_line',
    'near_newopen_carloan',
    'can_card_rate',
    'far_open_loan',
    'near_open_loan',
    'inac_card_rate',
    'od_card_rate',
    'min_cardline',
    'clear_loan_num',
    'min_cardline_f',
    'near_open_percosloan',
    'bus_loan_num',
    'manaquery_in6m_f',
    'com_insurquery_max',
    'inac_card_num',
    'y',
]]
# for i in var: # step06_draw_plot.drawHistogram(df4[i]) #============================================================================== #IV保留大于0.02的变量,170个变量保留126个 new_data,iv_value = step01_feature_engine.filter_iv(df2, group=10) #对数据按照IV大小顺序进行排序,以便于使用fillter_pearson删除相关性较高里面IV值低的数据 list_value = iv_value[iv_value.ori_IV >= 0.02].var_name iv_sort_columns = list(list_value.drop_duplicates()) df3 = new_data[iv_sort_columns] iv_value.to_excel(r"F:\TS\offline_model\01_Dataset\04_Output\01_refresh\iv_value_refresh01.xls") ##皮尔森系数绘图,观察多重共线的变量 pearson_coef = step02_modle_plot.plot_pearson(df3) #多变量分析,保留相关性低于阈值0.6的变量 #对产生的相关系数矩阵进行比较,并删除IV比较小的变量 per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold = 0.60) print ('保留了变量有:',len(per_col)) print (per_col) #126个变量,保留33个 df4 = new_data[['selfquery_cardquery_in6m','selfquery_loquery_in3m', 'card_num_fo', 'selfquery_loquery_cardquery_in1m', 'selfquery_in3m_min_interval', 'max_cardline', 'max_loanline', 'manaquery_in24m_def', 'unclear_monthpay', 'min_cardline', 'near_open_loan', 'max_carloan_line', 'min_cardline_f', 'near_newopen_carloan', 'pettyloan_loquery_in6m', 'cardquery_in24m', 'near_house_loan', 'inac_card_rate', 'card_cardquery_rate', 'cardquery_in6m_max', 'selfquery5_in12m', 'card_num', 'selfquery6_in1m', 'manaquery_in1m_def', 'can_card_num', 'cardquery_in3m', 'manaquery_in24m_f', 'clear_loan_num',
df6.insert(152, 'y', last) #======================================== 变量筛选,IV值和相关系数====================================== #IV保留大于0.02的变量,156个变量保留119个 new_data,iv_value = step01_feature_engine.filter_iv(df6, group=10) #对数据按照IV大小顺序进行排序,以便于使用fillter_pearson删除相关性较高里面IV值低的数据 list_value = iv_value[iv_value.ori_IV >= 0.02].var_name iv_sort_columns = list(list_value.drop_duplicates()) df7 = new_data[iv_sort_columns] iv_value.to_excel(r"F:\TS\offline_model\output\iv_value_group01.xls") ##皮尔森系数绘图,观察多重共线的变量 pearson_coef = step02_modle_plot.plot_pearson(df7) #多变量分析,保留相关性低于阈值0.6的变量 #对产生的相关系数矩阵进行比较,并删除IV比较小的变量 per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold = 0.50) print ('保留了变量有:',len(per_col)) print (per_col) #135个变量,保留49个 df8 = df6[[ 'selfquery_cardquery_in6m', 'group_level', 'housing_nature_g', 'local_nolocal_g', 'selfquery_loquery_cardquery_in1m', 'social_fund_basenum', 'pettyloan_loquery_in6m', 'selfquery_in3m_min_interval', 'asset_g', 'desired_loan_amt_g', 'max_cardline_f', 'use_card_num', 'card_cardquery_rate', 'apply_city_g', 'near_open_loan', 'month_other_income_g', 'company_nature_g', 'max_loanline', 'far_open_loan', 'near_newopen_carloan', 'loan_num', 'min_cardline', 'sex_g', 'card_num', 'other_debet',