# In[300]:
# Compute train/test KS for the trained XGBoost booster via scorecardpy.
# To persist the booster: bst.save_model('0001.model')

# Score both splits with the trained booster.
train_pred = bst.predict(dtrain)
train_labels = dtrain.get_label()
test_pred = bst.predict(dtest)
# BUG FIX: the original assigned dtest.get_label() to `train_labels`,
# silently overwriting the train labels with the test labels.
test_labels = dtest.get_label()

# KS / AUC / Gini diagnostics for each split (plots + metrics dict).
train_perf = sc.perf_eva(y_train, train_pred, title="train")
test_perf = sc.perf_eva(y_test, test_pred, title="test")

# PSI / model-building notes from the original notebook:
# train_p = train_pred.copy()
# test_p = test_pred.copy()

# NOTE(review): `preds` is not defined in this cell -- presumably produced by
# an earlier cell; confirm, otherwise this line raises NameError.
print("各样本prediction为", preds)

# In[285]:
train_pred.shape
# --- Threshold sweep statistics (this chunk starts mid-loop: the matching
# `for` header over the *train* thresholds lies in an earlier cell) ----------
# NOTE(review): sklearn's confusion_matrix(y_true, y_pred) is laid out as
# [[TN, FP], [FN, TP]], so the value named `TPR` below is actually the TNR
# (TN / (TN + FP)) and `FPR` is actually the FNR (FN / (FN + TP)).  The `ks`
# figure is nevertheless correct because TNR - FNR == TPR - FPR.
cm =confusion_matrix(y_train, train_pred_new)
TPR = round(cm[0,0]/(cm[0,0]+cm[0,1]),4)
FPR = round(cm[1,0]/(cm[1,0]+cm[1,1]),4)
ks = round(TPR - FPR,4)
#dpd_rate = round(cm[1,0]/(cm[0,0]+cm[1,0]),4)
acc = round((cm[0,0]+cm[1,1])/cm.sum(),4)  # overall accuracy
print(i, TPR, FPR, ks, acc)

# Test set: sweep classification thresholds from 0.20 to 0.89 (step 0.01)
# and report the same statistics for each threshold.
#test_pred_new = np.array(pd.DataFrame(test_pred).iloc[:, 0].apply(lambda x: 0 if x<0.35 else 1))
proba_range = np.arange(0.2,0.9,0.01)
#print(proba_range)
# Header columns (Chinese): threshold, TPR, FPR, ks, accuracy.
print('阈值', '真正率', '假正率', 'ks', '准确率')
for i in proba_range:
    i = round(i,2)
    # Binarise predicted probabilities at threshold i (0 below, 1 at/above).
    test_pred_new = np.array(pd.DataFrame(test_pred).iloc[:, 0].apply(lambda x: 0 if x<i else 1))
    cm =confusion_matrix(y_test, test_pred_new)
    # Same caveat as above: under sklearn's layout these are TNR / FNR.
    TPR = round(cm[0,0]/(cm[0,0]+cm[0,1]),4)
    FPR = round(cm[1,0]/(cm[1,0]+cm[1,1]),4)
    ks = round(TPR - FPR,4)
    #dpd_rate = round(cm[1,0]/(cm[0,0]+cm[1,0]),4)
    acc = round((cm[0,0]+cm[1,1])/cm.sum(),4)
    # Share of actual bads among cases predicted good: FN / (TN + FN).
    dpd = round(cm[1,0]/(cm[0,0]+cm[1,0]),4)
    print(i, TPR, FPR, ks, acc, dpd)

# KS / AUC / Gini plots for both splits.
train_perf = sc.perf_eva(y_train, train_pred, title='train')
test_perf = sc.perf_eva(y_test, test_pred, title='test')

# Save the model.  NOTE(review): `lr` is presumably the fitted model from an
# earlier cell -- confirm it is the object meant to be persisted here.
joblib.dump(lr, 'insight.m')
# Split the WOE-transformed frames into features / target ('BAD' is the label).
# NOTE(review): `y_train`, `train_woe` and `test_woe` come from earlier cells.
X_train = train_woe.loc[:, train_woe.columns != 'BAD']
y_test = test_woe.loc[:, 'BAD']
X_test = test_woe.loc[:, train_woe.columns != 'BAD']
# Fit logit model (binomial-family GLM == logistic regression).
# NOTE(review): sm.GLM does not add an intercept column automatically; the
# use of fit.params[0] as the intercept below assumes X_train's first column
# is a constant term -- TODO confirm against how train_woe is built.
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())
fit = lr.fit()
fit.summary()  # display-only in a notebook; return value is discarded here
# Get predicted probabilities for both splits.
train_pred = fit.predict(X_train)
test_pred = fit.predict(X_test)
# Plot diagnostics: KS curve, then ROC curve.  The second call overwrites
# `test_perf`, so only the ROC result is kept afterwards.
test_perf = sc.perf_eva(y_test, test_pred, title="test", plot_type=['ks'])
test_perf = sc.perf_eva(y_test, test_pred, title="test", plot_type=['roc'])


class ModelDetails():
    # Adapter exposing statsmodels results through the scikit-learn
    # LogisticRegression attribute names that scorecardpy expects.
    def __init__(self, intercept, coefs):
        """Hold fitted-model values in the shape scorecardpy expects.

        Because the scorecardpy package can only take a model class of
        LogisticRegression from the scikit-learn package, this class mimics
        its `intercept_` / `coef_` attributes (including their nesting:
        one-element list, and list of coefficient lists) using the values
        from the statsmodels fit.
        """
        self.intercept_ = [intercept]
        self.coef_ = [coefs.tolist()]


# First fitted parameter treated as intercept, the rest as coefficients.
model = ModelDetails(fit.params[0], fit.params[1:])
def evaluate_main(self):
    """Build the scorecard and write the full evaluation report.

    Produces "<filename>_report.xlsx" with sheets: card_result,
    train_result, test_result (if test data present), ott_result (if ott
    data present) and feature_importance, plus one perf-plot PNG per split.
    Side effects: sets self.train_pred / self.test_pred / self.ott_pred.
    """
    writer = pd.ExcelWriter("{}_report.xlsx".format(self.filename))

    # Bad/good odds of the training set drive the score calibration:
    # score = a - b * log(odds), the standard PDO scaling.
    counts = self.df_train_woe[self.target_name].value_counts()
    odds0 = float(counts[1]) / float(counts[0])
    b = self.double_score / np.log(2)
    a = self.base_score + b * np.log(odds0)

    card = sc.scorecard(self.bins_adj, self.model, self.final_features,
                        points0=self.base_score, odds0=odds0,
                        pdo=self.double_score)
    # Concatenate once instead of repeatedly inside a loop (O(n) vs O(n^2)).
    card_df = pd.concat(
        [pd.DataFrame(columns=["variable", "bin", "points"])]
        + list(card.values()))
    card_df.to_excel(writer, 'card_result')

    self.train_pred = self._evaluate_split(
        writer, self.df_train, self.df_train_woe, card, a, b,
        "train", 'train_result', "{}_train.png".format(self.filename))

    if self.df_test.any().any():
        self.test_pred = self._evaluate_split(
            writer, self.df_test, self.df_test_woe, card, a, b,
            "test", 'test_result', "{}_test.png".format(self.filename))

    if self.df_ott.any().any():
        try:
            # BUG FIX: the figure was saved as "{}_test.png", clobbering the
            # test plot; it now goes to "{}_ott.png".
            self.ott_pred = self._evaluate_split(
                writer, self.df_ott, self.df_ott_woe, card, a, b,
                "ott", 'ott_result', "{}_ott.png".format(self.filename))
        except Exception as exc:  # narrowed from bare `except:`; best-effort
            self.log.info("Cannot calculate the ott data! (%s)", exc)

    # Coefficients keyed by feature name, merged into the IV table.
    importance = {x: y for x, y in zip(self.final_features,
                                       self.model.coef_[0])}
    # .copy() avoids pandas' SettingWithCopy warning on the filtered frame.
    iv_df = self.iv_df[self.iv_df['variable'].isin(self.final_features)].copy()
    iv_df["coef"] = iv_df.variable.map(lambda x: importance[x])
    iv_df.to_excel(writer, 'feature_importance')

    writer.close()
    self.log.info("全部环节结束,请查看相关文件!")

def _evaluate_split(self, writer, df_raw, df_woe, card, a, b,
                    split_name, sheet_name, fig_name):
    """Score one data split, print KS/AUC/Gini, save the perf plot and the
    score table to `sheet_name`; return the predicted probabilities."""
    y_true = df_woe[self.target_name]
    pred = self.model.predict_proba(df_woe[self.final_features])[:, 1]
    perf = sc.perf_eva(y_true, pred, title=split_name)
    print("On {}-data, the evaluation follows:\nks={}, auc={}, gini={}".format(
        split_name, perf["KS"], perf["AUC"], perf["Gini"]))
    perf["pic"].savefig(fig_name)
    _score = sc.scorecard_ply(df_raw, card, print_step=0)
    _score["flag"] = y_true
    _score["pred"] = pred
    self._get_score_table(_score, a, b).to_excel(writer, sheet_name)
    return pred