# Multivariate analysis: keep the variables whose correlation is below the
# 0.6 threshold. The resulting correlation-coefficient matrix is compared and
# the variables with the smaller IV are dropped (per the original author's note).
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.6)
print('保留了变量有:', len(per_col))
print(per_col)  # author's note: 136 variables, 37 kept

# Hard-coded feature columns retained for modelling
# (presumably copied from the printed `per_col` -- TODO confirm).
feature_columns = [
    'var8', 'var94', 'var17', 'var7', 'var13', 'var139',
    'var78', 'var53', 'var121', 'var97', 'var147', 'var113',
    'var59', 'var57', 'var27', 'var114', 'var26', 'var144',
    'var154', 'var2', 'var141', 'var136', 'var65', 'var135',
    'var123', 'var107', 'var108', 'var122', 'var40', 'var118',
    'var133', 'var89', 'var19', 'var14', 'var134', 'var145',
    'var156',
]
# Slice out the retained features together with the 'y' column.
lf5 = new_data[feature_columns + ['y']]

# Re-inspect collinearity on the reduced column set.
pearson_coef = step02_modle_plot.plot_pearson(lf5)

# Dump the reduced data set to disk.
csv_path = r"C:\Users\Administrator\Desktop\data.csv"
lf5.to_csv(csv_path)

# IV filtering with group=5; the IV table is exported for inspection.
data, iv_value = step01_feature_engine.filter_iv(lf5, group=5)
iv_report_path = r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value_2.xls"
iv_value.to_excel(iv_report_path)

# Split into X and y.
X, y = step01_feature_engine.x_y_data(data)

# author's note: two variables have VIF > 10, i.e. they are collinear
vif_data = step01_feature_engine.judge_vif(X)

# Resample with the project's SMOTE helper, then fit the baseline model.
X, y = step01_feature_engine.smote_data(X, y)
model = step03_built_modle.baseline_model(X, y)
# Example no. 2
# 0
# pvals = pvals.to_dict()
# 
#==============================================================================


#==============================================================================
# from sklearn.preprocessing import StandardScaler # 导入模块
# sc = StandardScaler()
# X[Col] = sc.fit_transform(X[Col])
#==============================================================================


# Handle class imbalance; the original author recommends this approach when
# there are too few samples.
(X, y) = step01_feature_engine.smote_data(X, y)

# Fit the baseline model on the resampled data.
baseline_fit = step03_built_modle.baseline_model(X, y)
model = baseline_fit

# The bare string below is a no-op expression statement: it holds metrics
# pasted in by the original author from an earlier run.
'''
confusion_matrix 
 [[1531  861]
 [ 764 1628]]
accuracy_score 0.668147373922
precision_score 0.663346613546
recall_score 0.691588785047
ROC_AUC is 0.728932086353
K-S score 0.354737526648'''

# Alternative marked by the author; note that it rebinds `model`.
'''or: '''

model = log_model_test(X, y)