def extrapolation(self, X_kgb, y_kgb, X_igb):
    """Extrapolation-method iteration for reject inference.

    Repeatedly labels rejected samples by scaling each score-bin's observed
    bad rate, refits the estimator on the augmented sample, and stops once
    the KS statistic on the accepted sample deteriorates.

    :param X_kgb: accepted (known good/bad) sample features
    :param y_kgb: accepted sample labels
    :param X_igb: rejected (inferred good/bad) sample features
    :return: None — refits ``self.estimator`` in place and, when
        ``self.file_path`` is set, dumps the final augmented data to CSV
    """
    IK_down, IK_up, k = (self.method_dict['IK_down'],
                         self.method_dict['IK_up'],
                         self.method_dict['k'])
    flag = self.method_dict['flag']
    # Per-bin bad-rate multipliers, linearly spaced over [IK_down, IK_up).
    interval = [IK_down + (IK_up - IK_down) / k * i for i in range(k)]
    KS_max = 0
    n = 0
    X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
    while True:
        pred_kgb = self.estimator.predict_proba(X_kgb)[:, 1]
        ks, _ = calc_ks(y_kgb, pred_kgb)
        if KS_max > ks:
            break  # KS got worse — keep the previous model and stop
        print('第{}轮ks值为{}'.format(n, ks))
        KS_max = ks
        n += 1
        pred_igb = self.estimator.predict_proba(X_igb)[:, 1]
        # Quantile-bin the accepted predictions, then apply the SAME edges
        # to the rejected predictions.
        # NOTE(review): with duplicates='drop' fewer than k bins may remain,
        # which would misalign with ``interval`` — confirm upstream data.
        _, bins = pd.qcut(pred_kgb, k, duplicates='drop', retbins=True)
        kgb_cut = pd.cut(pred_kgb, bins=bins)
        igb_cut = pd.cut(pred_igb, bins=bins)
        kgb_bad_rate = y_kgb.groupby(kgb_cut).apply(
            lambda x: x.sum() / len(x))
        # Rejected bad rate per bin = accepted bad rate * multiplier.
        igb_bad_rate = kgb_bad_rate * interval
        # Label the rejected samples bin by bin.
        X_igb_copy = X_igb.copy()
        X_igb_copy['cut'] = igb_cut
        X_igb_copy['pred'] = pred_igb
        tmp = pd.DataFrame()
        for name, group in X_igb_copy.groupby('cut'):
            bad_rate = igb_bad_rate[name]
            # BUGFIX: np.ceil returns a float; iloc slicing requires an int.
            bad_num = int(np.ceil(group.shape[0] * bad_rate))
            group = group.sort_values(by='pred', ascending=False,
                                      ignore_index=True)
            group['y'] = 1
            if flag:
                # Keep only the worst bad_num rejects (all labelled bad).
                group = group.iloc[:bad_num]
            else:
                # Keep all rejects: worst bad_num stay bad, rest become good.
                group.iloc[bad_num:, -1] = 0
            # BUGFIX: DataFrame.append was removed in pandas 2.0.
            tmp = pd.concat([tmp, group], ignore_index=True)
        # Drop the 'cut'/'pred'/'y' helper columns when merging features.
        X_agb = pd.concat([X_kgb, tmp.iloc[:, :-3]], ignore_index=True)
        y_agb = pd.concat([y_kgb, tmp.loc[:, 'y']], ignore_index=True)
        self.estimator.fit(X_agb, y_agb)
    if self.file_path is not None:
        pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path, index=False)
def hard_cutoff(self, X_kgb, y_kgb, X_igb):
    """Hard-cutoff iteration for reject inference.

    Labels the highest-scoring rejects as bad (count derived from the
    accepted bad rate scaled by IK), refits on the augmented sample, and
    stops once KS on the accepted sample deteriorates.

    :param X_kgb: accepted sample features
    :param y_kgb: accepted sample labels
    :param X_igb: rejected sample features
    :return: None — refits ``self.estimator`` in place and, when
        ``self.file_path`` is set, dumps the final augmented data to CSV
    """
    IK, flag = self.method_dict['IK'], self.method_dict['flag']
    # Expected bad/good counts among rejects: rejected bad rate is assumed
    # to be IK times the accepted bad rate.
    bad_rate_kgb = y_kgb.sum() / len(y_kgb)
    bad_rate_igb = bad_rate_kgb * IK
    # BUGFIX: np.ceil returns a float; an int is required both for array
    # slicing (idx_sort[:bad_num_igb]) and as a size for np.ones/np.zeros.
    bad_num_igb = int(np.ceil(len(X_igb) * bad_rate_igb))
    good_num_igb = len(X_igb) - bad_num_igb
    KS_max = 0
    n = 0
    X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
    while True:
        pred_kgb = self.estimator.predict_proba(X_kgb)[:, 1]
        ks, _ = calc_ks(y_kgb, pred_kgb)
        if KS_max > ks:
            break  # KS got worse — keep the previous model and stop
        print('第{}轮ks值为{}'.format(n, ks))
        KS_max = ks
        n += 1
        pred_igb = self.estimator.predict_proba(X_igb)[:, 1]
        idx_sort = np.argsort(-pred_igb)  # rejects, worst (highest p) first
        if flag:
            # Keep only the top bad_num_igb rejects, all labelled bad.
            X_agb = pd.concat([X_kgb, X_igb.iloc[idx_sort[:bad_num_igb]]],
                              ignore_index=True)
            y_agb = pd.concat([y_kgb, pd.Series(np.ones(bad_num_igb))],
                              ignore_index=True)
        else:
            # Keep all rejects: top ones labelled bad, the rest good.
            X_agb = pd.concat([X_kgb, X_igb.iloc[idx_sort]],
                              ignore_index=True)
            y_agb = pd.concat([
                y_kgb,
                pd.Series(np.ones(bad_num_igb)),
                pd.Series(np.zeros(good_num_igb))
            ], ignore_index=True)
        self.estimator.fit(X_agb, y_agb)
    if self.file_path is not None:
        pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path, index=False)
def fuzzy_augmentation(self, X_kgb, y_kgb, X_igb):
    """Fuzzy-augmentation iteration for reject inference.

    Duplicates every reject as one "good" and one "bad" record weighted by
    the model's class probabilities, refits with sample weights, and stops
    once KS on the accepted sample deteriorates.

    :param X_kgb: accepted sample features
    :param y_kgb: accepted sample labels
    :param X_igb: rejected sample features
    :return: None — refits ``self.estimator`` in place and, when
        ``self.file_path`` is set, dumps the final augmented data to CSV
    """
    KS_max = 0
    n = 0
    # BUGFIX: copy before adding the helper column so the caller's
    # DataFrame is not mutated as a side effect.
    X_kgb = X_kgb.copy()
    X_kgb['weight'] = 1
    X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
    while True:
        # Exclude the trailing 'weight' column when scoring.
        pred_kgb = self.estimator.predict_proba(X_kgb.iloc[:, :-1])[:, 1]
        ks, _ = calc_ks(y_kgb, pred_kgb)
        if KS_max > ks:
            break  # KS got worse — keep the previous model and stop
        print('第{}轮ks值为{}'.format(n, ks))
        KS_max = ks
        n += 1
        # Hoisted: a single predict_proba call instead of two per round.
        proba_igb = self.estimator.predict_proba(X_igb)
        X_igb_bad, X_igb_good = X_igb.copy(), X_igb.copy()
        X_igb_bad['weight'] = proba_igb[:, 1]   # P(bad) as bad-copy weight
        X_igb_good['weight'] = proba_igb[:, 0]  # P(good) as good-copy weight
        X_agb = pd.concat([X_kgb, X_igb_good, X_igb_bad],
                          ignore_index=True)
        y_agb = pd.concat([
            y_kgb,
            pd.Series(np.zeros(len(X_igb))),
            pd.Series(np.ones(len(X_igb)))
        ], ignore_index=True)
        self.estimator.fit(X_agb.iloc[:, :-1], y_agb,
                           sample_weight=X_agb['weight'])
    if self.file_path is not None:
        pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path, index=False)
def _get_woe_iv(self, X: Series, y: Series, col_name):
    """Compute per-bin WOE/IV statistics for one feature column.

    :param X: single feature column
    :param y: binary label column (1 = bad)
    :param col_name: feature name; key into ``self.features_bins`` /
        ``self.features_info`` / ``self.features_df``
    :return: None — results are stored on ``self.features_bins[col_name]``
        and appended to ``self.features_df``
    """
    is_num = self.features_info[col_name]
    nan_flag = self.features_bins[col_name]['flag']
    bins = self.features_bins[col_name]['bins']
    # Keep the full bin-label list for reporting; ``bins`` may be trimmed
    # below when a special/NaN bin is present.
    # BUGFIX: previously the NaN bin's label was dropped from the 'bin'
    # column while its counts were still appended, misaligning features_df.
    bin_labels = list(bins)
    B = y.sum()        # total bads
    G = y.size - B     # total goods
    b_bins = []
    g_bins = []
    col_ks = None
    col_gini = None
    # BUGFIX: the guard flags were cross-wired — ``is_ks`` gated the gini
    # computation and ``is_gini`` gated the KS computation.
    if self.is_gini == 1:
        col_gini = calc_gini(y, X)
    if self.is_ks == 1:
        col_ks, _ = calc_ks(y, X)
    if nan_flag == 1:
        # First bin holds special/NaN values; count it, then drop those
        # rows before the regular binning loop.
        mask = X.isin(bins[0])
        b_bins.append(y[mask].sum())
        g_bins.append(mask.sum() - y[mask].sum())
        bins = bins[1:]
        X = X[~mask]
        y = y[~mask]
    if is_num:
        # Numeric feature: bins are (left, right] intervals.
        for left, right in bins:
            mask = (X > left) & (X <= right)
            b_bins.append(y[mask].sum())
            g_bins.append(mask.sum() - y[mask].sum())
    else:
        # Categorical feature: each bin is a collection of category values.
        for v in bins:
            mask = X.isin(v)
            b_bins.append(y[mask].sum())
            g_bins.append(mask.sum() - y[mask].sum())
    b_bins = np.array(b_bins)
    g_bins = np.array(g_bins)
    count_bins = b_bins + g_bins
    woes = woe_single_all(B, G, b_bins, g_bins).tolist()
    # IV = sum over bins of (smoothed bad share - smoothed good share) * WOE.
    temp = (b_bins + __SMOOTH__) / (B + __SMOOTH__) - (
        g_bins + __SMOOTH__) / (G + __SMOOTH__)
    iv = float(np.around((temp * woes).sum(), 6))
    self.features_bins[col_name]['counts'] = count_bins
    self.features_bins[col_name]['bads'] = b_bins
    self.features_bins[col_name]['woes'] = woes
    self.features_bins[col_name]['iv'] = iv
    self.features_df['col_name'].extend([col_name] * b_bins.size)
    self.features_df['bin'].extend(bin_labels)
    self.features_df['bad'].extend(b_bins)
    self.features_df['count'].extend(count_bins)
    self.features_df['rate'].extend(b_bins / count_bins)
    self.features_df['woe'].extend(woes)
    self.features_df['iv'].extend([iv] * b_bins.size)
    # BUGFIX: gini values are stored under the is_gini flag and ks values
    # under the is_ks flag, matching the (fixed) computations above.
    if self.is_gini == 1:
        self.features_df['gini'].extend([col_gini] * b_bins.size)
    if self.is_ks == 1:
        self.features_df['ks'].extend([col_ks] * b_bins.size)
"max_iter": 100, "penalty": "l2", "C": 1.0, "random_state": 0 } model = BasicTrainer(algorithm='lr', params=params) model.fit(train_x, train_y) train_pred = model.estimator.predict_proba(train_x)[:, -1] val_pred = model.estimator.predict_proba(val_x)[:, -1] test_pred = model.estimator.predict_proba(test_x)[:, -1] # 获取入模变量相关信息字典 bins_info = {} for col in seven_feats: bins_info[col] = {} bins_info[col]['bins'] = DT.features_bins[col]['bins'] bins_info[col]['woes'] = DT.features_woes[col] bins_info[col]['flag'] = DT.features_bins[col]['flag'] bins_info[col]['type'] = DT.features_info[col] sc = ScoreStretch(S=ori_data['y'].sum() / ori_data.shape[0], pred=train_pred) sc.transform_pred_to_score(test_pred) sc.transform_data_to_score(test_x, model.estimator) val_ks = calc_ks(val_y, val_pred) test_ks = calc_ks(test_y, test_pred) val_auc = calc_auc(val_y, val_pred) test_auc = calc_auc(test_y, test_pred) print('验证集auc={}, ks={}'.format(val_auc, val_ks)) print('测试集auc={}, ks={}'.format(test_auc, test_ks))