Example #1
0
def rf_importance(data):
    """Fit a small random forest on *data* and plot feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table including the target column; it is split into
        X / y by ``step01_feature_engine.x_y_data``.

    Side effects: prints each (name, importance) pair and shows a bar chart.
    """
    import matplotlib.pyplot as plt
    import numpy as np  # was relying on an unseen global `np`; import locally like plt
    from sklearn.ensemble import RandomForestClassifier
    names = data.columns
    X, y = step01_feature_engine.x_y_data(data)

    clf = RandomForestClassifier(n_estimators=10,
                                 random_state=123)  # build the random forest classifier
    clf.fit(X, y)  # fit on features and target
    # Print each (feature name, importance) pair.
    # (Removed a no-op `names, clf.feature_importances_` expression here.)
    for feature in zip(names, clf.feature_importances_):
        print(feature)

    #plt.style.use('fivethirtyeight')
    #plt.rcParams['figure.figsize'] = (10,5)

    ## visualize the feature importances ##
    importances = clf.feature_importances_
    feat_names = names
    indices = np.argsort(importances)[::-1]  # descending importance order
    #fig = plt.figure(figsize=(14,10))
    plt.title("Feature importances")
    plt.bar(range(len(indices)),
            importances[indices],
            color='lightblue',
            align="center")
    plt.xticks(range(len(indices)),
               feat_names[indices],
               rotation='vertical',
               fontsize=10)
    plt.xlim([-1, len(indices)])
    plt.show()
# Multivariate analysis: keep variables whose pairwise correlation is below the 0.6 threshold.
# Compare the entries of the correlation matrix and drop the variable with the smaller IV.
# NOTE(review): relies on `pearson_coef` / `new_data` computed earlier in the file — not visible here.
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)  # of the 136 variables, 37 are kept
# Manually selected low-correlation subset plus the target column 'y'.
lf5 = new_data[[
    'var8', 'var94', 'var17', 'var7', 'var13', 'var139', 'var78', 'var53',
    'var121', 'var97', 'var147', 'var113', 'var59', 'var57', 'var27', 'var114',
    'var26', 'var144', 'var154', 'var2', 'var141', 'var136', 'var65', 'var135',
    'var123', 'var107', 'var108', 'var122', 'var40', 'var118', 'var133',
    'var89', 'var19', 'var14', 'var134', 'var145', 'var156', 'y'
]]

pearson_coef = step02_modle_plot.plot_pearson(lf5)  # re-inspect collinearity on the reduced set

# Persist the reduced table (hard-coded local path).
lf5.to_csv(r"C:\Users\Administrator\Desktop\data.csv")

# IV-based filtering with 5 bins per variable.
data, iv_value = step01_feature_engine.filter_iv(lf5, group=5)

iv_value.to_excel(
    r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value_2.xls")

X, y = step01_feature_engine.x_y_data(data)

vif_data = step01_feature_engine.judge_vif(X)  # two variables have VIF > 10 (collinear)

# Rebalance the classes with SMOTE before modeling.
X, y = step01_feature_engine.smote_data(X, y)

model = step03_built_modle.baseline_model(X, y)
Example #3
0
# Optimal binning based on chi-square (ChiMerge) — kept for reference, disabled.
# one_woe = pd.DataFrame([])
# new_col = list(data_all.columns)
# new_col.remove('near_open_percosloan')
# new_col.remove('y')
# 
# for var in new_col:
#     new_woe = new_iv.ChiMerge(data_all, var, 'y')
#     one_woe = one_woe.append(new_woe)
#     print(var)
#     
# csvfile = r"F:\TS\offline_model\01_Dataset\04_Output\chi_iv\chi_iv_all.csv"
# one_woe.to_csv(csvfile,sep=',',index=False ,encoding = 'utf-8')

# Optimal binning based on step02_bining.
X, y = step01_feature_engine.x_y_data(data_all)
new_col = ['work_years','social_fund_basenum','cal_yearly_income','external_debat_ratio',
           'cal_debat_ratio1', 'selfquery_cardquery_in6m',
           'selfquery_loquery_in3m', 'card_num_fo',
           'selfquery_loquery_cardquery_in1m', 'selfquery_in3m_min_interval',
           'max_cardline', 'max_loanline', 'manaquery_in24m_def', 'near_open_loan',
           'max_carloan_line', 'near_newopen_carloan', 'pettyloan_loquery_in6m',
           'cardquery_in24m', 'near_house_loan', 'cardquery_in6m_max',
           'selfquery5_in12m', 'card_num', 'selfquery6_in1m', 'can_card_num',
           'cardquery_in3m', 'manaquery_in24m_f', 'max_percosloan_line',]
# Bin each continuous variable and collect its WOE table.
# DataFrame.append was removed in pandas 2.0 (and was O(n^2)); collect the
# per-variable frames in a list and concatenate once at the end instead.
woe_frames = []
for var in new_col:
    woe_bin = step02_bining.binContVar(X[var], y, method=4)
    woe_bin['var_name'] = var  # tag each row with its source variable
    woe_frames.append(woe_bin)
    print(var)
woe_bin_data = pd.concat(woe_frames) if woe_frames else pd.DataFrame()
Example #4
0
# NOTE(review): `per_col_all` / `new_data_all` come from earlier, unseen parts of the file.
print('保留了变量有:', len(per_col_all))
print(per_col_all)  # 80 of the 126 variables are collinear; the remaining 36 are kept
# Manually selected non-collinear subset plus the target column 'y'.
df_data_last = new_data_all[[
    'selfquery_cardquery_in3m', 'score', 'card_cardquery_rate',
    'housing_nature_g', 'local_nolocal_g', 'mean_cardline', 'normal_card_num',
    'selfquery_in3m_min_interval', 'max_loanline', 'sum_carloan_line',
    'monthly_other_income', 'near_newopen_carloan', 'company_nature_g',
    'can_card_rate', 'credit_use_ratio', 'far_open_loan',
    'desired_loan_amount', 'near_open_loan', 'inac_card_rate', 'age',
    'od_card_rate', 'monthly_salary', 'min_cardline', 'clear_loan_num',
    'education_g', 'min_cardline_f', 'near_open_percosloan', 'sex_g',
    'bus_loan_num', 'manaquery_in6m_f', 'com_insurquery_max', 'inac_card_num',
    'child_count', 'y'
]]

X, y = step01_feature_engine.x_y_data(df_data_last)
## Logistic regression is sensitive to collinearity, so check VIF.
## VIF greater than 5 or 10 indicates a serious collinearity problem.
# All predictors had VIF below 10, i.e. no remaining multicollinearity risk.
#==============================================================================
# from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
#vif_data = pd.DataFrame([])
#vif_data["VIF_Factor"] = [vif(X.values, i) for i in range(X.shape[1])]
#vif_data["features"] = X.columns
#==============================================================================
vif_data = step01_feature_engine.judge_vif(X)  # two variables have VIF > 10 (collinear)

# NOTE(review): only 'age' is dropped although the comment above mentions two
# collinear variables — confirm the second one was handled elsewhere.
data_all_last = df_data_last.drop(["age"], axis=1)

csvfile = r"F:\TS\offline_model\01_Dataset\02_Interim\all_data\data_loan.csv"
data_all_last.to_csv(csvfile, sep=',', index=False, encoding='utf-8')
Example #5
0
    'self_query_24_month_frequency', 'loan__query_24_month_frequency',
    'card_apply_03_month_frequency', 'card_60_pastdue_frequency',
    'max_cardline', 'selfquery_cardquery_in6m', 'cardquery_card_num_dvalue',
    'y', 'company_nature_g_0', 'company_nature_g_1', 'company_nature_g_2',
    'company_nature_g_3', 'company_nature_g_4', 'company_nature_g_5',
    'housing_nature_g_0.0', 'housing_nature_g_1.0', 'housing_nature_g_2.0',
    'housing_nature_g_3.0', 'housing_nature_g_4.0', 'sex_g_0', 'sex_g_1',
    'local_nolocal_g_0', 'local_nolocal_g_1', 'education_g_0', 'education_g_1',
    'education_g_2', 'education_g_3', 'y'
]]

##皮尔森系数删除多重共线的变量
pearson_coef = step02_modle_plot.plot_pearson(df7)

##构造X,y变量
X, y = step01_feature_engine.x_y_data(new_data3)
X, y = step01_feature_engine.x_y_data(df7)

##特征缩放,标准化
X = step01_feature_engine.standard_scaler(X)

##方法一:递归消除算法
#X, y = step01_feature_engine.wrapper_data(X, y,n_features_to_select = 15)

##方法二: 随机逻辑回归
scoretable, X_picked = step01_feature_engine.rdlg_variables(X,
                                                            y,
                                                            threshold=0.15)

#==============================================================================
       #'card_num', 
       #'near_newopen_carloan', 
       #'com_loquery_max_in3m', 
     #  'work_years',
       #'near_open_percosloan',
       #'min_cardline_f', 
       #'normal_card_num',
       #'other_debet', 
       #'clear_loan_num',
       'married_g', 
       #'cal_debat_ratio2',                           
       "y"]]
print(loan_best_banning.shape[1]-1)  # number of predictors (excluding the 'y' column)
# Inspect variable correlations.

X, y = step01_feature_engine.x_y_data(loan_best_banning)

## Logistic regression is sensitive to collinearity, so check VIF.
## VIF greater than 5 or 10 indicates a serious collinearity problem.
# All predictors had VIF below 10, i.e. no remaining multicollinearity risk.
vif_data = step01_feature_engine.judge_vif(X) # 3 variables have VIF > 5 (collinear)

pearson_coef = step02_modle_plot.plot_pearson(loan_best_banning)


# Load the WOE table from the hard-coded workbook path.
#woe = pd.read_excel(r"F:\TS\offline_model\02_DataProcess\03_best_IV\02_read_woe_01.xlsx")
woe = pd.read_excel(r"F:\TS\offline_model\output\02_best_iv\02_best_iv.xlsx")
print(len(woe.var_name.drop_duplicates()))  # number of distinct variables with WOE rows

# NOTE(review): duplicates the x_y_data call above — confirm this repeat is intentional.
X, y = step01_feature_engine.x_y_data(loan_best_banning)