class ModelDetails:
    """Minimal stand-in for a fitted scikit-learn ``LogisticRegression``.

    Because the scorecardpy package can only take a model class of
    LogisticRegression from the scikit-learn package, this class is needed
    to hold the values from the statsmodels package under the same
    ``intercept_`` / ``coef_`` attribute names.
    """

    def __init__(self, intercept, coefs):
        """Store the parameters as one-element lists.

        Args:
            intercept: Intercept of the fitted model.
            coefs: Remaining coefficients; must provide ``tolist()``
                (e.g. a pandas Series or a NumPy array).
        """
        self.intercept_ = [intercept]
        self.coef_ = [coefs.tolist()]


# Take the intercept by position.  ``fit.params[0]`` on a label-indexed
# pandas Series relies on the deprecated positional fallback of
# ``Series.__getitem__``; ``.iloc`` is explicit.  Plain arrays (no ``.iloc``)
# keep using ordinary indexing.
_params = fit.params
_intercept = _params.iloc[0] if hasattr(_params, "iloc") else _params[0]
model = ModelDetails(_intercept, _params[1:])

# Create scorecard
# NOTE(review): ``X_train.columns[1:]`` skips the first column, presumably the
# constant term that pairs with the intercept above -- confirm.
card = sc.scorecard(bins, model, X_train.columns[1:], points0=800, pdo=50)

# Create scores
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

# Population stability (PSI) of the scores, train vs. test
sc.perf_psi(
    score={
        'train': train_score,
        'test': test_score
    },
    label={
        'train': y_train,
        'test': y_test
    }
)
# Split the WOE-transformed frames into the target and the predictors.
y_train = train_woe.loc[:, 'creditability']
X_train = train_woe.loc[:, train_woe.columns != 'creditability']
y_test = test_woe.loc[:, 'creditability']
# The column mask must come from test_woe itself: a mask built from
# train_woe.columns only works when both frames have identical column order
# and otherwise selects the wrong columns (or raises on a length mismatch).
X_test = test_woe.loc[:, test_woe.columns != 'creditability']

# logistic regression ------
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(X_train, y_train)
# lr.coef_
# lr.intercept_

# predicted probability (column 1 of predict_proba)
train_pred = lr.predict_proba(X_train)[:, 1]
test_pred = lr.predict_proba(X_test)[:, 1]

# performance ks & roc ------
train_perf = sc.perf_eva(y_train, train_pred, title="train")
test_perf = sc.perf_eva(y_test, test_pred, title="test")

# score ------
card = sc.scorecard(bins_adj, lr, X_train.columns)

# credit score (applied to the raw, non-WOE frames)
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

# psi
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': y_train, 'test': y_test}
)
def evaluate_main(self):
    """Build the scorecard, evaluate each data split and write the report.

    Side effects:
        * Writes ``<filename>_report.xlsx`` with the sheets ``card_result``,
          ``train_result``, ``test_result`` / ``ott_result`` (only for the
          splits that are evaluated) and ``feature_importance``.
        * Saves one performance figure per evaluated split as
          ``<filename>_<split>.png``.
        * Stores the predicted probabilities on ``self.train_pred``,
          ``self.test_pred`` and ``self.ott_pred``.

    A failure while evaluating the ott split is logged and skipped; the
    rest of the report is still written.
    """
    target = self.target_name
    features = self.final_features

    # Odds of class 1 vs. class 0 in the training data; used both as the
    # scorecard's reference odds and for the score-table parameters a / b.
    class_counts = self.df_train_woe[target].value_counts()
    odds0 = float(class_counts[1]) / float(class_counts[0])
    b = self.double_score / np.log(2)
    a = self.base_score + b * np.log(odds0)

    # The context manager guarantees the workbook is closed even when one of
    # the evaluation steps raises.
    with pd.ExcelWriter("{}_report.xlsx".format(self.filename)) as writer:
        card = sc.scorecard(self.bins_adj, self.model, features,
                            points0=self.base_score, odds0=odds0,
                            pdo=self.double_score)
        # One concat over all per-variable tables instead of growing the
        # frame inside a loop; the empty frame pins the leading columns.
        card_df = pd.concat(
            [pd.DataFrame(columns=["variable", "bin", "points"]), *card.values()]
        )
        card_df.to_excel(writer, sheet_name='card_result')

        def report_split(split, raw_df, y_true, pred):
            """Evaluate one split, save its figure and write its score sheet."""
            perf = sc.perf_eva(y_true, pred, title=split)
            print("On {}-data, the evaluation follows:\nks={}, auc={}, gini={}".format(
                split, perf["KS"], perf["AUC"], perf["Gini"]))
            # Each split gets its own file; previously the ott figure was
            # saved under the test name and overwrote the test plot.
            perf["pic"].savefig("{}_{}.png".format(self.filename, split))
            scores = sc.scorecard_ply(raw_df, card, print_step=0)
            scores["flag"] = y_true
            scores["pred"] = pred
            table = self._get_score_table(scores, a, b)
            table.to_excel(writer, sheet_name="{}_result".format(split))

        self.train_pred = self.model.predict_proba(self.df_train_woe[features])[:, 1]
        report_split("train", self.df_train, self.df_train_woe[target], self.train_pred)

        # NOTE(review): ``.any().any()`` is True only if the frame holds at
        # least one truthy value -- presumably meant as a "has data" check.
        if self.df_test.any().any():
            self.test_pred = self.model.predict_proba(self.df_test_woe[features])[:, 1]
            report_split("test", self.df_test, self.df_test_woe[target], self.test_pred)

        if self.df_ott.any().any():
            y_ott = self.df_ott_woe[target]
            self.ott_pred = self.model.predict_proba(self.df_ott_woe[features])[:, 1]
            try:
                report_split("ott", self.df_ott, y_ott, self.ott_pred)
            except Exception as exc:
                # Best effort: the ott evaluation may fail, but that must not
                # abort the report.  Unlike a bare ``except`` this lets
                # KeyboardInterrupt / SystemExit through and records the cause.
                self.log.info("Cannot caculation the ott data!")
                self.log.info("ott evaluation error: {!r}".format(exc))

        importance = dict(zip(features, self.model.coef_[0]))
        # Copy before adding a column so the assignment does not target a
        # slice of self.iv_df (SettingWithCopyWarning).
        iv_df = self.iv_df[self.iv_df['variable'].isin(features)].copy()
        iv_df["coef"] = iv_df["variable"].map(lambda name: importance[name])
        iv_df.to_excel(writer, sheet_name='feature_importance')

    self.log.info("全部环节结束,请查看相关文件!")
'NumberRealEstateLoansOrLines_woe', 'NumberOfDependents_woe'], axis=1) train_pred = lr.predict_proba(x_train)[:, 1] test_pred = lr.predict_proba(x_test)[:, 1] # # lr (statsmodels version) # import statsmodels.api as sm # X1 = sm.add_constant(x_train) # logit = sm.Logit(y_train, X1) # result = logit.fit() # print(result.summary()) # X2 = sm.add_constant(x_test) # predict = result.predict(X2) # score ------ card = sc.scorecard(cutoff, lr, x_train.columns, points0=600, odds0=1 / 20, pdo=20, basepoints_eq0=False) column = ['basepoints', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse'] if not (os.path.exists('./data/card')): os.makedirs(os.path.join('./data/card')) for i in card.keys(): card[i].to_csv('./data/card/{}.csv'.format(i), index=False) # performance ks & roc ------ train_perf = sc.perf_eva(y_train, train_pred, title="train") test_perf = sc.perf_eva(y_test, test_pred, title="test") # credit score train_score = sc.scorecard_ply(train_data, card, print_step=0, only_total_score=False) test_score = sc.scorecard_ply(test_data, card, print_step=0, only_total_score=False)
def execute_data():
    """Score the record in the current request with the cached model.

    Reads ``paramsData`` from the request JSON, (re)loads the model when the
    requested model path differs from the cached one, converts the record to
    WOE values, predicts its label, builds the scorecard and computes the
    total score plus the points of every scorecard variable.

    Returns:
        ``jsonify`` response.  On success it carries ``code`` ``'00000'``,
        ``score``, ``label`` and ``featureScore``.  On failure it carries one
        of the module-level templates with ``errorMsg`` filled in:
        ``code10001`` for a ``KeyError``, ``code10002`` for a ``ValueError``
        or a non-dict ``paramsData``, ``code10003`` for anything else.

    NOTE(review): the error templates are module-level objects mutated in
    place, so concurrent failing requests can overwrite each other's
    ``errorMsg`` -- confirm how this service is deployed.
    """
    global modelPath, binsPath, bins, model
    start = time.time()
    try:
        logger.info('入参:%s', str(request.get_data()))
        request_data = request.get_json()
        # Fetch the submitted data.
        paramsJson = request_data['paramsData']
        modelFilePath = modelFilePathCheck(request_data)

        # Guard clause: only a JSON object can be turned into a record.
        if not isinstance(paramsJson, dict):
            code10002['errorMsg'] = '输入值错误:请传入json格式参数'
            return jsonify(code10002)

        # Convert numeric values in the raw data to int.
        # NOTE(review): int() truncates floats and raises ValueError for
        # strings such as "3.5" -- confirm the features are integer-valued.
        for key, value in paramsJson.items():
            if is_number(value):
                paramsJson[key] = int(value)
        logger.info("paramsJson:%s", paramsJson)

        # Build the DataFrame.
        df = pd.json_normalize(paramsJson)

        if modelPath != modelFilePath:
            logger.info('调用模型路径发生变化,重新加载模型!')
            logger.info('global modelFilePath:%s', modelPath)
            logger.info('param modelFilePath:%s', modelFilePath)
            modelPath = modelFilePath
            # Load the model.
            model = joblib.load(modelFilePath)
        else:
            logger.info('调用模型路径未发生变化,使用缓存中模型。')
            logger.info('global modelFilePath:%s', modelPath)

        # Convert the raw data to WOE values.
        bins = model.bins
        df_woe = sc.woebin_ply(df, bins)
        # Predict the label.
        label = model.predict(df_woe)[0]
        # Build the scorecard.
        card = sc.scorecard(bins, model, df_woe.keys())
        # Score the record, keeping the per-variable points.
        score = sc.scorecard_ply(df, card, only_total_score=False, print_step=0)

        # Collect the points of every variable.
        # featureScore = calculateFeatures(df, card)
        if isinstance(card, dict):
            card_df = pd.concat(card, ignore_index=True)
        elif isinstance(card, pd.DataFrame):
            card_df = card.copy(deep=True)
        else:
            # Previously this fell through to a NameError on card_df.
            raise TypeError('unsupported scorecard type: ' + type(card).__name__)
        # x variables
        xs = card_df.loc[card_df.variable != 'basepoints', 'variable'].unique()
        featureScore = {name: score[name + '_points'][0] for name in xs}

        result = {
            'code': '00000',
            'score': str(score['score'][0]),
            'label': str(label),
            'featureScore': featureScore,
        }
        end = time.time()
        logger.info("运行结果:%s,模型执行耗时:%s", result, end - start)
        return jsonify(result)
    except KeyError as e:
        logger.info(e)
        code10001['errorMsg'] = '输入特征错误:' + str(e)
        return jsonify(code10001)
    except ValueError as e:
        logger.info(e)
        code10002['errorMsg'] = '输入值错误:' + str(e)
        return jsonify(code10002)
    except Exception as e:
        # Unexpected failure: log the traceback, not just the message.
        logger.exception(e)
        code10003['errorMsg'] = '未知错误:' + str(e)
        return jsonify(code10003)