def test_quality_object_type_array_with_nan():
    """Quality report on an object-dtype feature column that contains NaN."""
    raw = np.array([np.nan, 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='O')
    frame = pd.DataFrame({
        'feature': raw[mask],
        'target': target,
    })
    report = quality(frame)
    assert report.loc['feature', 'iv'] == 0.01637933818053033
def test_quality():
    """Full quality report: check IV, gini, entropy and unique counts."""
    report = quality(df, 'target')
    expected = {
        ('feature', 'iv'): 0.5313391779453922,
        ('A', 'gini'): 0.49284164671885444,
        ('B', 'entropy'): 0.6924956879070063,
        ('feature', 'unique'): 500,
    }
    for (row, col), value in expected.items():
        assert report.loc[row, col] == value
def test_quality_iv_only():
    """With iv_only=True the non-IV metrics are left as NaN."""
    report = quality(df, 'target', iv_only=True)
    assert np.isnan(report.loc['feature', 'gini'])
def data_quality(self):
    """Return the toad quality report, computing and caching it on first use.

    Bug fix: the original guard was ``if not self._quality_data``, which
    raises ``ValueError`` once the cache holds a DataFrame (DataFrame
    truthiness is ambiguous in pandas). Compare against ``None`` instead;
    this assumes the attribute is initialised to ``None`` — TODO confirm.

    NOTE(review): ``target=self._target_data`` passes the whole dataset as
    the ``target`` argument; toad.quality expects a target column name
    (e.g. 'target'). Looks wrong — confirm against the caller.
    """
    if self._quality_data is None:
        # Lazily compute and memoise the report.
        self._quality_data = toad.quality(self._target_data, target=self._target_data)
    return self._quality_data
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] #PSI[f'fold_{fold_n + 1}'] = toad.metrics.PSI(X_train.drop(cats,axis=1),X_valid.drop(cats,axis=1)).values PSI[f'fold_{fold_n + 1}'] = toad.metrics.PSI(X_train.TransactionAmt, X_valid.TransactionAmt) del X_train, X_valid, y_train, y_valid gc.collect() return PSI.drop('feature', axis=1).iloc[0].mean() # IV & PSI before binning print('Class: TransactionAmt') print('Original Information Value: ', toad.quality(X[['TransactionAmt', 'y']], 'y').iv.values[0]) print('Original Population Stability Index: ', PSI_cal(5, X, y, 'TransactionAmt')) # Binning - cart tree iv = [] PSIs = [] TransactionAmt = X.TransactionAmt for bins in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]: bins = toad.DTMerge(TransactionAmt, y, n_bins=bins).tolist() bins.insert(0, -np.inf) bins.append(np.inf) X.TransactionAmt = np.digitize(TransactionAmt, bins) iv.append(toad.quality(X[['TransactionAmt', 'y']], 'y').iv.values[0]) PSIs.append(PSI_cal(5, X, y, cats))
def evaluate(test_data, excel_name='report.xlsx', num=10, iv_threshold_value=0.02,
             unique_num=20, overdue_days=False, self_data=None):
    """Compute per-variable quality stats for ``test_data`` and write a
    multi-sheet Excel report.

    Sheets written: variable quality (IV/gini/entropy/unique), distributions
    of the top-IV variables, optionally overdue-day crosstabs, and optionally
    incremental-modelling KS results against ``self_data``.

    Parameters
    ----------
    test_data : DataFrame with a 'loan_apply_no' key and a 'target' column.
    excel_name : output workbook path.
    num, iv_threshold_value : how many / how strong variables to keep by IV.
    unique_num : max distinct values for the distribution/crosstab views.
    overdue_days : if True, also write overdue-day crosstabs.
    self_data : optional baseline modelling data (must carry 'SMP_N',
        'loan_apply_no', 'target'); enables the KS sheet.

    Bug fix: ``DataFrame.append`` was removed in pandas 2.0 — the baseline
    KS row is now added with ``pd.concat`` instead.
    """
    # Compute IV and related stats for every variable, then write the sheet.
    workbook = xlsxwriter.Workbook(excel_name)
    quality = toad.quality(test_data.drop(columns=["loan_apply_no"]), target="target")
    quality.sort_values(by='iv', ascending=False, inplace=True)
    quality['var_name'] = quality.index
    draw_data(workbook, workbook.add_worksheet('quality'), quality, start_index=0,
              title='变量探查结果',
              columns=['var_name', 'iv', 'gini', 'entropy', 'unique'])
    print("quality 计算完毕")

    # Pick the high-IV variables and write their distribution plots.
    quality = quality.replace("--", -1)
    high_iv_var = select_iv(quality, num, iv_threshold_value)
    high_iv_var_list = high_iv_var.index.tolist()
    all_var_pic(high_iv_var_list, test_data, workbook, workbook.add_worksheet("分布"),
                0, unique_num)
    print("变量分组处理完毕")

    all_data = None
    if overdue_days or self_data is not None:
        # Join the external data onto the test variables by application id.
        all_data = pd.merge(self_data, test_data.drop(columns=['target']),
                            how='left', on='loan_apply_no')

    if overdue_days:
        # Crosstabs of each test variable against overdue days.
        test_var = test_data.columns.tolist()
        test_var.remove("loan_apply_no")
        test_var.remove("target")
        save_overdue_crosstab(all_data, test_var, workbook,
                              workbook.add_worksheet("逾期天数"), 0,
                              unique_num=unique_num)
        print("逾期天数处理完毕")

    if self_data is not None:
        # Split into the development and verification samples.
        data_train = all_data[all_data['SMP_N'] == 'dev_smp']
        data_test = all_data[all_data['SMP_N'] == 'ver_smp']
        # WOE-encode categorical variables on train, apply to test.
        data_train, data_test = replace_with_woe(
            data_train, data_test,
            exclude_var=['SMP_N', 'loan_apply_no'], target='target')

        # Baseline model on the self_data variables only.
        self_data_var_list = self_data.columns.tolist()
        drop_var = ['loan_apply_no', 'SMP_N', 'target']
        for var in drop_var:
            self_data_var_list.remove(var)
        if 'overdue_days' in self_data_var_list:
            self_data_var_list.remove('overdue_days')
        model, param = param_opt(data_train[self_data_var_list], data_train['target'])
        train_ks = get_ks(model, data_train[self_data_var_list], data_train['target'])
        test_ks = get_ks(model, data_test[self_data_var_list], data_test['target'])

        # Seed the result table with the baseline (no added variables) row.
        # DataFrame.append was removed in pandas 2.0; use concat instead.
        result = pd.DataFrame(columns=['train_ks', 'test_ks', 'var'])
        baseline_row = pd.DataFrame([{
            'train_ks': train_ks,
            'test_ks': test_ks,
            'var': '[]'
        }])
        result = pd.concat([result, baseline_row], ignore_index=True)

        # Add candidate variables one by one and record the resulting KS.
        result = compute_ks(data_train, data_test, quality, self_data_var_list,
                            test_ks, result, target='target')
        draw_data(workbook, workbook.add_worksheet("ks"), result, start_index=0,
                  title='ks测试结果', columns=result.columns.tolist())
        print("模型计算完毕")

    workbook.close()
def get_quality(self, target='target', iv_only=False, **kwargs):
    """Build and return the quality report for this object's dataframe.

    Extra keyword arguments are forwarded to ``quality`` unchanged.
    """
    return quality(self.df, target, iv_only, **kwargs)
train_data, test_data = get_train_test_data() # 返回每个特性的EDA报告,包括数据类型、分布、缺失率和惟一值。 toad_detector = toad.detector.detect(train_data) # 下面以缺失率大于0.5.IV值小于0.05或者相关性大于0.9(保留较高的特征)来进行特征筛选。 selected_data, drop_lst = toad.selection.select(train_data, target='label', empty=0.5, iv=0.05, corr=0.9, return_drop=True, exclude=['phone_no_m']) # 返回每个特征的质量,包括iv、基尼系数和熵。可以帮助我们发现更有用的潜在信息。 quality = toad.quality(selected_data, 'label') # 对数值型和类别型变量进行分箱,支持决策树分箱、卡方分箱、最优分箱等 # 初始化一个combiner类 combiner = toad.transform.Combiner() # 训练数据并指定分箱方法,其它参数可选。分箱阈值的方法(method) 包括:'chi','dt','quantile','step','kmeans' combiner.fit(selected_data, y='label', method='chi', min_samples=0.05, exclude='phone_no_m') # 以字典形式保存分箱结果 bins = combiner.export() # 查看某特征的分箱区间值 print(bins["arpu"]) # 进行分箱转化