def extrapolation(self, X_kgb, y_kgb, X_igb):
        """
        Extrapolation (augmentation) iteration for reject inference.

        Repeatedly scores the rejected sample, infers its labels by scaling
        the accepted sample's per-bin bad rate, and refits the estimator on
        the combined data until KS on the accepted sample stops improving.

        :param X_kgb: accepted (known good/bad) sample features
        :param y_kgb: accepted sample labels
        :param X_igb: rejected (inferred good/bad) sample features
        :return: None; refits ``self.estimator`` and, when ``self.file_path``
            is set, writes the final augmented sample set to CSV
        """
        IK_down, IK_up, k = self.method_dict['IK_down'], self.method_dict[
            'IK_up'], self.method_dict['k']
        flag = self.method_dict['flag']
        # Per-bin bad-rate multipliers, linearly spaced in [IK_down, IK_up).
        interval = [IK_down + (IK_up - IK_down) / k * i for i in range(k)]

        KS_max = 0
        n = 0
        X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
        while True:
            pred_kgb = self.estimator.predict_proba(X_kgb)[:, 1]
            ks, _ = calc_ks(y_kgb, pred_kgb)
            # Stop as soon as KS on the accepted sample degrades.
            if KS_max > ks:
                break
            else:
                print('第{}轮ks值为{}'.format(n, ks))
                KS_max = ks
                n += 1

            pred_igb = self.estimator.predict_proba(X_igb)[:, 1]
            # Bin the accepted predictions, then apply the same bin edges
            # to the rejected predictions.
            _, bins = pd.qcut(pred_kgb, k, duplicates='drop', retbins=True)
            kgb_cut = pd.cut(pred_kgb, bins=bins)
            igb_cut = pd.cut(pred_igb, bins=bins)
            kgb_bad_rate = y_kgb.groupby(kgb_cut).apply(
                lambda x: x.sum() / len(x))
            # BUG FIX: when qcut drops duplicate edges there are fewer than k
            # bins and the elementwise multiply against the k-long interval
            # list raises; truncate the multipliers to the actual bin count.
            igb_bad_rate = kgb_bad_rate * interval[:len(kgb_bad_rate)]

            # Label the rejected samples bin by bin.
            X_igb_copy = X_igb.copy()
            X_igb_copy['cut'] = igb_cut
            X_igb_copy['pred'] = pred_igb
            parts = []
            for name, group in X_igb_copy.groupby('cut'):
                bad_rate = igb_bad_rate[name]
                # BUG FIX: np.ceil returns a float, which .iloc rejects —
                # cast to a plain int position.
                bad_num = int(np.ceil(group.shape[0] * bad_rate))
                group = group.sort_values(by='pred',
                                          ascending=False,
                                          ignore_index=True)
                group['y'] = 1
                if flag:
                    # Keep only the inferred-bad head of the bin.
                    group = group.iloc[:bad_num]
                else:
                    # Keep the whole bin; relabel the tail as good.
                    group.iloc[bad_num:, -1] = 0
                parts.append(group)
            # BUG FIX: DataFrame.append was removed in pandas 2.0; collect
            # the groups and concatenate once instead.
            tmp = pd.concat(parts) if parts else pd.DataFrame()

            # Drop the helper columns ('cut', 'pred', 'y') from the features.
            X_agb = pd.concat([X_kgb, tmp.iloc[:, :-3]], ignore_index=True)
            y_agb = pd.concat([y_kgb, tmp.loc[:, 'y']], ignore_index=True)
            self.estimator.fit(X_agb, y_agb)

        if self.file_path is not None:
            pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path,
                                                     index=False)
    def hard_cutoff(self, X_kgb, y_kgb, X_igb):
        """
        Hard-cutoff iteration for reject inference.

        Ranks rejects by predicted bad probability and labels the top
        ``bad_num_igb`` as bad (the rest as good when ``flag`` is falsy),
        refitting until KS on the accepted sample stops improving.

        :param X_kgb: accepted sample features
        :param y_kgb: accepted sample labels
        :param X_igb: rejected sample features
        :return: None; refits ``self.estimator`` and, when ``self.file_path``
            is set, writes the final augmented sample set to CSV
        """
        IK, flag = self.method_dict['IK'], self.method_dict['flag']

        # Expected bad/good counts among rejects, scaled by the IK factor.
        bad_rate_kgb = y_kgb.sum() / len(y_kgb)
        bad_rate_igb = bad_rate_kgb * IK
        # BUG FIX: np.ceil returns a float; slicing (idx_sort[:bad_num_igb])
        # and np.ones/np.zeros require an integer count — cast explicitly.
        bad_num_igb = int(np.ceil(len(X_igb) * bad_rate_igb))
        good_num_igb = len(X_igb) - bad_num_igb

        KS_max = 0
        n = 0
        X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
        while True:
            pred_kgb = self.estimator.predict_proba(X_kgb)[:, 1]
            ks, _ = calc_ks(y_kgb, pred_kgb)
            # Stop as soon as KS on the accepted sample degrades.
            if KS_max > ks:
                break
            else:
                print('第{}轮ks值为{}'.format(n, ks))
                KS_max = ks
                n += 1

            pred_igb = self.estimator.predict_proba(X_igb)[:, 1]
            # Rejects ranked from highest to lowest predicted bad probability.
            idx_sort = np.argsort(-pred_igb)
            if flag:
                # Keep only the top bad_num_igb rejects, all labelled bad.
                X_agb = pd.concat([X_kgb, X_igb.iloc[idx_sort[:bad_num_igb]]],
                                  ignore_index=True)
                y_agb = pd.concat(
                    [y_kgb, pd.Series(np.ones(bad_num_igb))],
                    ignore_index=True)
            else:
                # Keep all rejects: top slice labelled bad, remainder good.
                X_agb = pd.concat([X_kgb, X_igb.iloc[idx_sort]],
                                  ignore_index=True)
                y_agb = pd.concat([
                    y_kgb,
                    pd.Series(np.ones(bad_num_igb)),
                    pd.Series(np.zeros(good_num_igb))
                ],
                                  ignore_index=True)
            self.estimator.fit(X_agb, y_agb)

        if self.file_path is not None:
            pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path,
                                                     index=False)
    def fuzzy_augmentation(self, X_kgb, y_kgb, X_igb):
        """
        Fuzzy-augmentation iteration for reject inference.

        Each reject enters the training set twice — once labelled good and
        once labelled bad — weighted by the model's predicted class
        probabilities; accepted samples carry weight 1. Iterates until KS on
        the accepted sample stops improving.

        :param X_kgb: accepted sample features
        :param y_kgb: accepted sample labels
        :param X_igb: rejected sample features
        :return: None; refits ``self.estimator`` and, when ``self.file_path``
            is set, writes the final weighted sample set to CSV
        """
        KS_max = 0
        n = 0
        # BUG FIX: work on a copy so the caller's DataFrame is not mutated
        # by the added 'weight' column.
        X_kgb = X_kgb.copy()
        X_kgb['weight'] = 1
        X_agb, y_agb = pd.DataFrame(), pd.DataFrame()
        while True:
            # Exclude the trailing 'weight' column when scoring.
            pred_kgb = self.estimator.predict_proba(X_kgb.iloc[:, :-1])[:, 1]
            ks, _ = calc_ks(y_kgb, pred_kgb)
            # Stop as soon as KS on the accepted sample degrades.
            if KS_max > ks:
                break
            else:
                print('第{}轮ks值为{}'.format(n, ks))
                KS_max = ks
                n += 1

            # Score the rejects once; column 1 is P(bad), column 0 is P(good).
            proba_igb = self.estimator.predict_proba(X_igb)
            X_igb_bad, X_igb_good = X_igb.copy(), X_igb.copy()
            X_igb_bad['weight'] = proba_igb[:, 1]
            X_igb_good['weight'] = proba_igb[:, 0]

            X_agb = pd.concat([X_kgb, X_igb_good, X_igb_bad],
                              ignore_index=True)
            y_agb = pd.concat([
                y_kgb,
                pd.Series(np.zeros(len(X_igb))),
                pd.Series(np.ones(len(X_igb)))
            ],
                              ignore_index=True)

            # Last column is the sample weight; keep it out of the features.
            self.estimator.fit(X_agb.iloc[:, :-1],
                               y_agb,
                               sample_weight=X_agb['weight'])

        if self.file_path is not None:
            pd.concat([X_agb, y_agb], axis=1).to_csv(self.file_path,
                                                     index=False)
# --- Example #4 ---
    def _get_woe_iv(self, X: Series, y: Series, col_name):
        """
        Compute per-bin statistics (WOE, IV, counts) for a single feature.

        :param X: single feature column
        :param y: label series (1 = bad, 0 = good)
        :param col_name: feature column name
        :return: None; results are stored in ``self.features_bins[col_name]``
            and appended to the ``self.features_df`` accumulator
        """
        is_num = self.features_info[col_name]
        nan_flag = self.features_bins[col_name]['flag']
        bins = self.features_bins[col_name]['bins']
        B = y.sum()        # total bad count
        G = y.size - B     # total good count
        b_bins = []
        g_bins = []
        col_ks = None
        col_gini = None

        # BUG FIX: the guards were swapped — is_ks controlled the gini
        # computation and is_gini controlled ks, both here and when
        # extending features_df below. Each flag now guards its own metric.
        if self.is_ks == 1:
            col_ks, _ = calc_ks(y, X)
        if self.is_gini == 1:
            col_gini = calc_gini(y, X)

        # When flagged, the first "bin" lists the NaN placeholder values;
        # count it separately and drop those rows from the regular binning.
        if nan_flag == 1:
            mask = X.isin(bins[0])
            b_bins.append(y[mask].sum())
            g_bins.append(mask.sum() - y[mask].sum())
            bins = bins[1:]
            X = X[~mask]
            y = y[~mask]

        if is_num:
            # Numeric feature: bins are (left, right] intervals.
            for left, right in bins:
                mask = (X > left) & (X <= right)
                b_bins.append(y[mask].sum())
                g_bins.append(mask.sum() - y[mask].sum())
        else:
            # Categorical feature: bins are collections of category values.
            for v in bins:
                mask = X.isin(v)
                b_bins.append(y[mask].sum())
                g_bins.append(mask.sum() - y[mask].sum())

        b_bins = np.array(b_bins)
        g_bins = np.array(g_bins)
        count_bins = b_bins + g_bins
        woes = woe_single_all(B, G, b_bins, g_bins).tolist()
        # IV = sum over bins of (bad share - good share) * WOE, smoothed to
        # avoid division by zero on empty bins.
        temp = (b_bins + __SMOOTH__) / (B + __SMOOTH__) - (
            g_bins + __SMOOTH__) / (G + __SMOOTH__)
        iv = float(np.around((temp * woes).sum(), 6))

        self.features_bins[col_name]['counts'] = count_bins
        self.features_bins[col_name]['bads'] = b_bins
        self.features_bins[col_name]['woes'] = woes
        self.features_bins[col_name]['iv'] = iv

        self.features_df['col_name'].extend([col_name] * b_bins.size)
        self.features_df['bin'].extend(bins)
        self.features_df['bad'].extend(b_bins)
        self.features_df['count'].extend(count_bins)
        self.features_df['rate'].extend(b_bins / count_bins)
        self.features_df['woe'].extend(woes)
        self.features_df['iv'].extend([iv] * b_bins.size)
        if self.is_gini == 1:
            self.features_df['gini'].extend([col_gini] * b_bins.size)
        if self.is_ks == 1:
            self.features_df['ks'].extend([col_ks] * b_bins.size)
# --- Example #5 ---
        "max_iter": 100,
        "penalty": "l2",
        "C": 1.0,
        "random_state": 0
    }
    model = BasicTrainer(algorithm='lr', params=params)
    model.fit(train_x, train_y)
    train_pred = model.estimator.predict_proba(train_x)[:, -1]
    val_pred = model.estimator.predict_proba(val_x)[:, -1]
    test_pred = model.estimator.predict_proba(test_x)[:, -1]

    # 获取入模变量相关信息字典
    bins_info = {}
    for col in seven_feats:
        bins_info[col] = {}
        bins_info[col]['bins'] = DT.features_bins[col]['bins']
        bins_info[col]['woes'] = DT.features_woes[col]
        bins_info[col]['flag'] = DT.features_bins[col]['flag']
        bins_info[col]['type'] = DT.features_info[col]
    sc = ScoreStretch(S=ori_data['y'].sum() / ori_data.shape[0],
                      pred=train_pred)
    sc.transform_pred_to_score(test_pred)
    sc.transform_data_to_score(test_x, model.estimator)

    # BUG FIX: calc_ks returns a (ks, ...) tuple — everywhere else in this
    # module it is unpacked as "ks, _ = calc_ks(...)". Without unpacking,
    # the prints below would emit the raw tuple instead of the KS value.
    val_ks, _ = calc_ks(val_y, val_pred)
    test_ks, _ = calc_ks(test_y, test_pred)
    val_auc = calc_auc(val_y, val_pred)
    test_auc = calc_auc(test_y, test_pred)
    print('验证集auc={}, ks={}'.format(val_auc, val_ks))
    print('测试集auc={}, ks={}'.format(test_auc, test_ks))