def rf_importance(data):
    """Fit a random forest on *data* and report/plot feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Modelling table; ``step01_feature_engine.x_y_data`` splits it into
        features X and target y.

    Side effects
    ------------
    Prints one ``(name, importance)`` pair per feature and shows a bar chart
    of importances sorted in descending order.
    """
    import matplotlib.pyplot as plt
    import numpy as np  # needed for argsort below; local import keeps the fix self-contained
    from sklearn.ensemble import RandomForestClassifier

    names = data.columns
    X, y = step01_feature_engine.x_y_data(data)
    clf = RandomForestClassifier(n_estimators=10, random_state=123)  # build the random-forest classifier
    clf.fit(X, y)  # fit on features and target
    # NOTE(review): `names` comes from data.columns and presumably still contains
    # the target column, while feature_importances_ covers only X — confirm the
    # two line up, otherwise the printed/plotted labels are shifted.
    for feature in zip(names, clf.feature_importances_):
        print(feature)
    #plt.style.use('fivethirtyeight')
    #plt.rcParams['figure.figsize'] = (10,5)
    ## feature importances visualization ##
    importances = clf.feature_importances_
    feat_names = names
    indices = np.argsort(importances)[::-1]  # indices of importances, largest first
    #fig = plt.figure(figsize=(14,10))
    plt.title("Feature importances")
    plt.bar(range(len(indices)), importances[indices],
            color='lightblue', align="center")
    plt.xticks(range(len(indices)), feat_names[indices],
               rotation='vertical', fontsize=10)
    plt.xlim([-1, len(indices)])
    plt.show()
#多变量分析,保留相关性低于阈值0.6的变量 #对产生的相关系数矩阵进行比较,并删除IV比较小的变量 per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60) print('保留了变量有:', len(per_col)) print(per_col) #136个变量,保留37个 lf5 = new_data[[ 'var8', 'var94', 'var17', 'var7', 'var13', 'var139', 'var78', 'var53', 'var121', 'var97', 'var147', 'var113', 'var59', 'var57', 'var27', 'var114', 'var26', 'var144', 'var154', 'var2', 'var141', 'var136', 'var65', 'var135', 'var123', 'var107', 'var108', 'var122', 'var40', 'var118', 'var133', 'var89', 'var19', 'var14', 'var134', 'var145', 'var156', 'y' ]] pearson_coef = step02_modle_plot.plot_pearson(lf5) #再次观察共线情况 lf5.to_csv(r"C:\Users\Administrator\Desktop\data.csv") data, iv_value = step01_feature_engine.filter_iv(lf5, group=5) iv_value.to_excel( r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value_2.xls") X, y = step01_feature_engine.x_y_data(data) vif_data = step01_feature_engine.judge_vif(X) #两个变量VIF>10,共线 X, y = step01_feature_engine.smote_data(X, y) model = step03_built_modle.baseline_model(X, y)
# Optimal binning based on chi-square (ChiMerge) — kept for reference, disabled:
# one_woe = pd.DataFrame([])
# new_col = list(data_all.columns)
# new_col.remove('near_open_percosloan')
# new_col.remove('y')
#
# for var in new_col:
#     new_woe = new_iv.ChiMerge(data_all, var, 'y')
#     one_woe = one_woe.append(new_woe)
#     print(var)
#
# csvfile = r"F:\TS\offline_model\01_Dataset\04_Output\chi_iv\chi_iv_all.csv"
# one_woe.to_csv(csvfile,sep=',',index=False ,encoding = 'utf-8')

# Optimal binning via step02_bining: bin each continuous variable against y
# (method=4 — binning strategy defined in step02_bining, TODO confirm meaning)
# and stack the per-variable WOE tables into one frame.
X, y = step01_feature_engine.x_y_data(data_all)
new_col = ['work_years', 'social_fund_basenum', 'cal_yearly_income',
           'external_debat_ratio', 'cal_debat_ratio1',
           'selfquery_cardquery_in6m', 'selfquery_loquery_in3m',
           'card_num_fo', 'selfquery_loquery_cardquery_in1m',
           'selfquery_in3m_min_interval', 'max_cardline', 'max_loanline',
           'manaquery_in24m_def', 'near_open_loan', 'max_carloan_line',
           'near_newopen_carloan', 'pettyloan_loquery_in6m',
           'cardquery_in24m', 'near_house_loan', 'cardquery_in6m_max',
           'selfquery5_in12m', 'card_num', 'selfquery6_in1m',
           'can_card_num', 'cardquery_in3m', 'manaquery_in24m_f',
           'max_percosloan_line']
# Collect per-variable frames and concatenate once: DataFrame.append in a loop
# is quadratic and was removed in pandas 2.0.
_woe_frames = []
for var in new_col:
    woe_bin = step02_bining.binContVar(X[var], y, method=4)
    woe_bin['var_name'] = var
    _woe_frames.append(woe_bin)
    print(var)  # progress indicator — binning can be slow per variable
woe_bin_data = pd.concat(_woe_frames) if _woe_frames else pd.DataFrame()
print('保留了变量有:', len(per_col_all))
print(per_col_all)
# Of 126 variables, 80 are collinear and 36 are not; the 36 are kept (plus 'y').
df_data_last = new_data_all[[
    'selfquery_cardquery_in3m', 'score', 'card_cardquery_rate',
    'housing_nature_g', 'local_nolocal_g', 'mean_cardline',
    'normal_card_num', 'selfquery_in3m_min_interval', 'max_loanline',
    'sum_carloan_line', 'monthly_other_income', 'near_newopen_carloan',
    'company_nature_g', 'can_card_rate', 'credit_use_ratio',
    'far_open_loan', 'desired_loan_amount', 'near_open_loan',
    'inac_card_rate', 'age', 'od_card_rate', 'monthly_salary',
    'min_cardline', 'clear_loan_num', 'education_g', 'min_cardline_f',
    'near_open_percosloan', 'sex_g', 'bus_loan_num', 'manaquery_in6m_f',
    'com_insurquery_max', 'inac_card_num', 'child_count', 'y'
]]
X, y = step01_feature_engine.x_y_data(df_data_last)
## Logistic regression is sensitive to collinearity, so check VIF.
## VIF above 5 (or 10) indicates a serious collinearity problem.
# All VIFs below 10 would mean no multicollinearity concern among the features.
#==============================================================================
# from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
#vif_data = pd.DataFrame([])
#vif_data["VIF_Factor"] = [vif(X.values, i) for i in range(X.shape[1])]
#vif_data["features"] = X.columns
#==============================================================================
vif_data = step01_feature_engine.judge_vif(X)  # two variables have VIF > 10 (collinear)
# 'age' is dropped here — presumably one of the high-VIF variables; confirm against vif_data.
data_all_last = df_data_last.drop(["age"], axis=1)
csvfile = r"F:\TS\offline_model\01_Dataset\02_Interim\all_data\data_loan.csv"
data_all_last.to_csv(csvfile, sep=',', index=False, encoding='utf-8')
    'self_query_24_month_frequency', 'loan__query_24_month_frequency',
    'card_apply_03_month_frequency', 'card_60_pastdue_frequency',
    'max_cardline', 'selfquery_cardquery_in6m', 'cardquery_card_num_dvalue',
    'y',  # NOTE(review): 'y' appears twice in this list (also last) — likely unintentional
    'company_nature_g_0', 'company_nature_g_1', 'company_nature_g_2',
    'company_nature_g_3', 'company_nature_g_4', 'company_nature_g_5',
    'housing_nature_g_0.0', 'housing_nature_g_1.0', 'housing_nature_g_2.0',
    'housing_nature_g_3.0', 'housing_nature_g_4.0', 'sex_g_0', 'sex_g_1',
    'local_nolocal_g_0', 'local_nolocal_g_1', 'education_g_0',
    'education_g_1', 'education_g_2', 'education_g_3', 'y'
]]
## Use the Pearson correlation matrix to spot multicollinear variables.
pearson_coef = step02_modle_plot.plot_pearson(df7)
## Build the X and y variables.
# NOTE(review): the first x_y_data result is immediately overwritten by the
# second call on df7 — the new_data3 line looks like dead code; confirm.
X, y = step01_feature_engine.x_y_data(new_data3)
X, y = step01_feature_engine.x_y_data(df7)
## Feature scaling: standardization.
X = step01_feature_engine.standard_scaler(X)
## Method 1: recursive feature elimination (disabled).
#X, y = step01_feature_engine.wrapper_data(X, y,n_features_to_select = 15)
## Method 2: randomized logistic regression — keep features with score >= 0.15.
scoretable, X_picked = step01_feature_engine.rdlg_variables(X, y, threshold=0.15)
#==============================================================================
#'card_num', #'near_newopen_carloan', #'com_loquery_max_in3m', # 'work_years', #'near_open_percosloan', #'min_cardline_f', #'normal_card_num', #'other_debet', #'clear_loan_num', 'married_g', #'cal_debat_ratio2', "y"]] print(loan_best_banning.shape[1]-1) #观察变量相关性 X, y = step01_feature_engine.x_y_data(loan_best_banning) ##逻辑回归对共线性敏感,判断下VIF ##当VIF大于5或10时,代表模型存在严重的共线性问题 #所有自变量的VIF均低于10,说明自变量之间并不存在多重共线性的隐患。 vif_data = step01_feature_engine.judge_vif(X) #3个变量VIF>5,共线 pearson_coef = step02_modle_plot.plot_pearson(loan_best_banning) #导入WOE #woe = pd.read_excel(r"F:\TS\offline_model\02_DataProcess\03_best_IV\02_read_woe_01.xlsx") woe = pd.read_excel(r"F:\TS\offline_model\output\02_best_iv\02_best_iv.xlsx") print(len(woe.var_name.drop_duplicates())) X, y = step01_feature_engine.x_y_data(loan_best_banning)