Ejemplo n.º 1
0
def test_quality_object_type_array_with_nan():
    """Check the IV that ``quality`` reports for an object column holding NaN.

    The column is built by indexing a small pool of labels (one NaN plus the
    strings 'A'..'G') with the module-level ``mask``; the label column is the
    module-level ``target``.
    """
    label_pool = [np.nan, 'A', 'B', 'C', 'D', 'E', 'F', 'G']
    feature = np.array(label_pool, dtype='O')[mask]

    frame = pd.DataFrame({'feature': feature, 'target': target})
    report = quality(frame)

    observed_iv = report.loc['feature', 'iv']
    assert observed_iv == 0.01637933818053033
Ejemplo n.º 2
0
def test_quality():
    """Pin selected cells of the ``quality`` report computed on the shared ``df``."""
    report = quality(df, 'target')

    # (row label, column label, expected cell value), checked in this order.
    expected_cells = (
        ('feature', 'iv', 0.5313391779453922),
        ('A', 'gini', 0.49284164671885444),
        ('B', 'entropy', 0.6924956879070063),
        ('feature', 'unique', 500),
    )
    for row, column, expected in expected_cells:
        assert report.loc[row, column] == expected
Ejemplo n.º 3
0
def test_quality_iv_only():
    """With ``iv_only=True`` the 'gini' cell of the 'feature' row must be NaN."""
    report = quality(df, 'target', iv_only=True)
    gini_cell = report.loc['feature', 'gini']
    assert np.isnan(gini_cell)
Ejemplo n.º 4
0
 def data_quality(self):
     """Return the quality report for ``self._target_data``, computing it once.

     On first use the report is produced by ``toad.quality`` and stored in
     ``self._quality_data``; later calls return the stored object unchanged.

     Returns:
         Whatever ``toad.quality`` returned (stored in ``self._quality_data``).
     """
     cached = self._quality_data
     # A pandas object has no unambiguous truth value (``not frame`` raises
     # ValueError), so a stored report must never be truth-tested. Anything
     # exposing ``.empty`` is treated as an already-computed report; other
     # values keep the original "falsy means not computed yet" rule.
     if cached is None or (not hasattr(cached, 'empty') and not cached):
         # NOTE(review): ``target`` receives the same object as the data
         # argument; this looks like it should be a target column name or
         # label vector -- confirm against the caller before changing.
         cached = toad.quality(self._target_data, target=self._target_data)
         self._quality_data = cached
     return cached
Ejemplo n.º 5
0
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        #PSI[f'fold_{fold_n + 1}'] = toad.metrics.PSI(X_train.drop(cats,axis=1),X_valid.drop(cats,axis=1)).values

        PSI[f'fold_{fold_n + 1}'] = toad.metrics.PSI(X_train.TransactionAmt,
                                                     X_valid.TransactionAmt)
        del X_train, X_valid, y_train, y_valid
        gc.collect()
    return PSI.drop('feature', axis=1).iloc[0].mean()


# IV & PSI before binning
# Baseline: print both metrics for the TransactionAmt column before the loop
# further down overwrites X.TransactionAmt with bin indices.

print('Class: TransactionAmt')
# quality() is run on just the feature column plus 'y'; the first entry of the
# resulting `iv` column is printed.
print('Original Information Value: ',
      toad.quality(X[['TransactionAmt', 'y']], 'y').iv.values[0])
print('Original Population Stability Index: ',
      PSI_cal(5, X, y, 'TransactionAmt'))

# Binning - cart tree
# For each requested bin count: ask toad.DTMerge for split points, replace the
# column in X by bin indices, then record the IV and PSI of the binned column.
iv = []
PSIs = []
# Reference to the column taken before the loop; it is the input to DTMerge and
# np.digitize on every iteration.
# NOTE(review): presumably this still holds the raw values after
# X.TransactionAmt is reassigned below -- confirm.
TransactionAmt = X.TransactionAmt
for bins in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    # `bins` enters as the requested bin count and is rebound here to the list
    # of split points, so after the loop it holds the last list of edges.
    bins = toad.DTMerge(TransactionAmt, y, n_bins=bins).tolist()
    # Pad the split points with infinite outer edges before digitizing.
    bins.insert(0, -np.inf)
    bins.append(np.inf)
    # Overwrite the column in X with each value's bin index.
    X.TransactionAmt = np.digitize(TransactionAmt, bins)
    iv.append(toad.quality(X[['TransactionAmt', 'y']], 'y').iv.values[0])
    # NOTE(review): the baseline call above passes 'TransactionAmt' as the last
    # argument while this one passes `cats` -- confirm which is intended.
    PSIs.append(PSI_cal(5, X, y, cats))
Ejemplo n.º 6
0
def evaluate(test_data,
             excel_name='report.xlsx',
             num=10,
             iv_threshold_value=0.02,
             unique_num=20,
             overdue_days=False,
             self_data=None):
    """Write a variable-evaluation report for ``test_data`` to an Excel workbook.

    Always written: a 'quality' sheet (``toad.quality`` output sorted by IV)
    and a distribution sheet for the variables picked by ``select_iv``.
    Optionally written: an overdue-days cross-tab sheet and a 'ks' sheet with
    model KS results, both of which need ``self_data``.

    Args:
        test_data: DataFrame holding the candidate variables plus the
            'loan_apply_no' and 'target' columns (both are dropped by name).
        excel_name: Path of the workbook to create.
        num: Forwarded to ``select_iv`` together with ``iv_threshold_value``.
        iv_threshold_value: Forwarded to ``select_iv``.
        unique_num: Forwarded to ``all_var_pic`` and ``save_overdue_crosstab``.
        overdue_days: When truthy, write the overdue-days cross-tab sheet.
            Requires ``self_data``.
        self_data: Optional DataFrame merged with ``test_data`` on
            'loan_apply_no'. When given it must contain the columns
            'loan_apply_no', 'SMP_N' (rows tagged 'dev_smp' / 'ver_smp' form
            the train / test split) and 'target'.

    Raises:
        TypeError: If ``overdue_days`` is truthy but ``self_data`` is None.
    """
    # Fail fast: the overdue-days sheet is built from the merged frame, and the
    # merge cannot run without ``self_data``. Previously this only surfaced as
    # an error inside ``pd.merge`` after the first sheets had been computed.
    if overdue_days and self_data is None:
        raise TypeError(
            "self_data must be a DataFrame when overdue_days is enabled")

    # Compute IV and related statistics for the test data and write to Excel.
    workbook = xlsxwriter.Workbook(excel_name)
    quality = toad.quality(test_data.drop(columns=["loan_apply_no"]),
                           target="target")
    quality.sort_values(by='iv', ascending=False, inplace=True)
    quality['var_name'] = quality.index
    draw_data(workbook,
              workbook.add_worksheet('quality'),
              quality,
              start_index=0,
              title='变量探查结果',
              columns=['var_name', 'iv', 'gini', 'entropy', 'unique'])
    print("quality 计算完毕")

    # Select the high-IV test variables, bin them, and write them to Excel.
    # "--" cells are replaced by -1 before the selection runs.
    quality = quality.replace("--", -1)
    high_iv_var = select_iv(quality, num, iv_threshold_value)
    high_iv_var_list = high_iv_var.index.tolist()
    all_var_pic(high_iv_var_list, test_data, workbook,
                workbook.add_worksheet("分布"), 0, unique_num)
    print("变量分组处理完毕")

    # Merge the caller's own data with the test variables; 'target' is dropped
    # from test_data so that the merged frame keeps the one from self_data.
    all_data = None
    if overdue_days or self_data is not None:
        all_data = pd.merge(self_data,
                            test_data.drop(columns=['target']),
                            how='left',
                            on='loan_apply_no')
    if overdue_days:
        # Cross-tabulate every test variable (all columns except the key and
        # the target) on the merged data.
        test_var = test_data.columns.tolist()
        test_var.remove("loan_apply_no")
        test_var.remove("target")
        save_overdue_crosstab(all_data,
                              test_var,
                              workbook,
                              workbook.add_worksheet("逾期天数"),
                              0,
                              unique_num=unique_num)
        print("逾期天数处理完毕")
    if self_data is not None:
        # Split into train and test sets.
        data_train = all_data[all_data['SMP_N'] == 'dev_smp']
        data_test = all_data[all_data['SMP_N'] == 'ver_smp']
        # Apply WOE encoding to the discrete variables.
        data_train, data_test = replace_with_woe(
            data_train,
            data_test,
            exclude_var=['SMP_N', 'loan_apply_no'],
            target='target')
        # Add variables one at a time, fit a model, compute its KS, and
        # finally write the results to Excel.
        self_data_var_list = self_data.columns.tolist()
        drop_var = ['loan_apply_no', 'SMP_N', 'target']
        for var in drop_var:
            self_data_var_list.remove(var)
        if 'overdue_days' in self_data_var_list:
            self_data_var_list.remove('overdue_days')
        model, param = param_opt(data_train[self_data_var_list],
                                 data_train['target'])
        train_ks = get_ks(model, data_train[self_data_var_list],
                          data_train['target'])
        test_ks = get_ks(model, data_test[self_data_var_list],
                         data_test['target'])
        # Baseline row (no extra variables). Built directly because
        # DataFrame.append was removed in pandas 2.0.
        result = pd.DataFrame(
            [{
                'train_ks': train_ks,
                'test_ks': test_ks,
                'var': '[]'
            }],
            columns=['train_ks', 'test_ks', 'var'])
        result = compute_ks(data_train,
                            data_test,
                            quality,
                            self_data_var_list,
                            test_ks,
                            result,
                            target='target')
        draw_data(workbook,
                  workbook.add_worksheet("ks"),
                  result,
                  start_index=0,
                  title='ks测试结果',
                  columns=result.columns.tolist())
        print("模型计算完毕")
    workbook.close()
Ejemplo n.º 7
0
 def get_quality(self, target='target', iv_only=False, **kwargs):
     """Return the ``quality`` report computed on ``self.df``.

     Args:
         target: Forwarded to ``quality`` as its second positional argument.
         iv_only: Forwarded to ``quality`` as the ``iv_only`` keyword.
         **kwargs: Extra keyword arguments forwarded to ``quality`` unchanged.

     Returns:
         Whatever ``quality`` returns.
     """
     # ``iv_only`` is passed by name rather than by position so it cannot be
     # bound to a different third parameter of ``quality``.
     # NOTE(review): if ``quality`` is toad's, newer releases appear to take
     # ``cpu_cores`` in third position -- confirm against the installed version.
     return quality(self.df, target, iv_only=iv_only, **kwargs)
Ejemplo n.º 8
0
    train_data, test_data = get_train_test_data()

    # 返回每个特性的EDA报告,包括数据类型、分布、缺失率和惟一值。
    toad_detector = toad.detector.detect(train_data)

    # 下面以缺失率大于0.5.IV值小于0.05或者相关性大于0.9(保留较高的特征)来进行特征筛选。
    selected_data, drop_lst = toad.selection.select(train_data,
                                                    target='label',
                                                    empty=0.5,
                                                    iv=0.05,
                                                    corr=0.9,
                                                    return_drop=True,
                                                    exclude=['phone_no_m'])

    # 返回每个特征的质量,包括iv、基尼系数和熵。可以帮助我们发现更有用的潜在信息。
    quality = toad.quality(selected_data, 'label')

    # 对数值型和类别型变量进行分箱,支持决策树分箱、卡方分箱、最优分箱等
    # 初始化一个combiner类
    combiner = toad.transform.Combiner()
    # 训练数据并指定分箱方法,其它参数可选。分箱阈值的方法(method) 包括:'chi','dt','quantile','step','kmeans'
    combiner.fit(selected_data,
                 y='label',
                 method='chi',
                 min_samples=0.05,
                 exclude='phone_no_m')
    # 以字典形式保存分箱结果
    bins = combiner.export()
    # 查看某特征的分箱区间值
    print(bins["arpu"])
    # 进行分箱转化