def test_card_combiner_str_not_match():
    """Fitting a card fails when the combiner rules for 'C' do not match.

    The rules exported from the module-level ``combiner`` are overridden for
    column ``'C'`` and loaded into a fresh ``Combiner``. ``ScoreCard.fit`` is
    then expected to raise an exception whose message names ``'C'``.
    """
    rules = combiner.export()
    rules['C'] = [['A'], ['B'], ['C']]
    mismatched = Combiner().load(rules)

    binned = mismatched.transform(df)
    transformer = WOETransformer()
    woe = transformer.fit_transform(binned, target)

    card = ScoreCard(combiner=mismatched, transer=transformer)

    # the exception is expected while fitting the card, not while building it
    with pytest.raises(Exception) as excinfo:
        card.fit(woe, target)

    assert "'C' is not matched" in str(excinfo.value)
def test_card_combiner_str_not_match():
    """Creating a card fails when the combiner rules for 'C' do not match.

    The rules exported from the module-level ``combiner`` are overridden for
    column ``'C'`` and applied to a fresh ``Combiner`` via ``set_rules``. A
    ``LogisticRegression`` is fitted on the WOE data, and constructing the
    ``ScoreCard`` with that model is expected to raise an exception whose
    message names ``'C'``.
    """
    rules = combiner.export()
    rules['C'] = [['A'], ['B'], ['C']]
    mismatched = Combiner().set_rules(rules)

    binned = mismatched.transform(df)
    transformer = WOETransformer()
    woe = transformer.fit_transform(binned, target)

    fitted_model = LogisticRegression()
    fitted_model.fit(woe, target)

    # the exception is expected as soon as the card is created
    with pytest.raises(Exception) as excinfo:
        ScoreCard(
            combiner=mismatched,
            transer=transformer,
            model=fitted_model,
        )

    assert "'C' is not matched" in str(excinfo.value)
def num_bin(df: pd.DataFrame, cols: list = None, target: str = 'target', specials: list = None,
            bin_num_limit: int = 5, count_distr_limit: float = 0.05, sc_method='chimerge',
            non_mono_cols: list = None, init_bins=10, init_min_samples=0.05, init_method='chi',
            **kwargs):
    """Bin each column with ``woebin`` and collect the per-column total IV.

    Steps: coarse binning, monotonicity check, binning result.

    Columns listed in ``non_mono_cols`` are handed to ``woebin`` directly.
    Every other column is first fitted with a ``Combiner``; the exported split
    points go through ``monotonous_bin`` and the result is passed to
    ``woebin`` as ``breaks_list``.

    Args:
        df: Frame holding the feature columns and the ``target`` column.
        cols: Columns to bin. When ``None`` or empty, every column of ``df``
            except ``target`` is used. Any iterable of column names
            (list, tuple, pandas Index, ...) is accepted.
        target: Name of the target column.
        specials: Special values. When truthy, the same object is assigned to
            every column and forwarded as ``special_values``.
        bin_num_limit: Forwarded to ``woebin``.
        count_distr_limit: Forwarded to ``woebin``.
        sc_method: Forwarded to ``woebin`` as ``method``.
        non_mono_cols: Columns that skip the ``Combiner`` /
            ``monotonous_bin`` step.
        init_bins: Forwarded to ``Combiner.fit`` as ``n_bins``.
        init_min_samples: Forwarded to ``Combiner.fit`` as ``min_samples``.
        init_method: Forwarded to ``Combiner.fit`` as ``method``.
        **kwargs: Extra keyword arguments forwarded to ``Combiner.fit``.

    Returns:
        A ``(bind, ivd)`` tuple. ``bind`` maps each column to the entry
        ``woebin`` returned for it; ``ivd`` maps each column to the first
        unique value of that entry's ``'total_iv'`` column.
    """
    # Materialise first so that pandas Index / ndarray inputs do not hit the
    # ambiguous-truth-value error of a bare ``if not cols``.
    cols = list(cols) if cols is not None else []
    if not cols:
        cols = df.columns.difference([target]).tolist()
    non_mono_cols = list(non_mono_cols) if non_mono_cols is not None else []

    # the same special values apply to every column
    special_values = {col: specials for col in cols} if specials else specials

    bind, ivd = {}, {}
    t0 = time.process_time()
    for col in cols:
        woebin_args = dict(
            dt=df, x=col, y=target, special_values=special_values,
            bin_num_limit=bin_num_limit, count_distr_limit=count_distr_limit,
            method=sc_method, print_info=False,
        )
        if col not in non_mono_cols:
            # coarse split points first, then adjusted by monotonous_bin
            coarse = Combiner()
            coarse.fit(X=df[col], y=df[target], n_bins=init_bins,
                       min_samples=init_min_samples, method=init_method, **kwargs)
            init_points = coarse.export()[col]
            woebin_args['breaks_list'] = monotonous_bin(
                df=df, col=col, target=target, cutOffPoints=init_points,
                special_values=special_values,
            )
        bind[col] = woebin(**woebin_args)[col]
        ivd[col] = bind[col]['total_iv'].unique()[0]

    # process_time() already returns seconds (CPU time of this process); the
    # previous "* 100 / 60" scaling reported a value that was not seconds.
    elapsed = time.process_time() - t0
    print(f'there are bing {len(cols)} using {elapsed:.2f} seconds')
    return bind, ivd
def test_combiner_frame():
    """``fit_transform`` on the whole frame yields bin 2 at position (404, 1)."""
    frame_combiner = Combiner()
    binned = frame_combiner.fit_transform(df, target)
    assert binned.iloc[404, 1] == 2
def test_combiner_unique_feature():
    """Chi ``fit_transform`` on ``uni_feat`` yields bin 0 at index 451."""
    chi_combiner = Combiner()
    binned = chi_combiner.fit_transform(uni_feat, target, method='chi')
    assert binned[451] == 0
def test_combiner_with_str():
    """Chi ``fit_transform`` on ``str_feat`` yields bin 0 at index 451."""
    chi_combiner = Combiner()
    binned = chi_combiner.fit_transform(str_feat, target, method='chi')
    assert binned[451] == 0
def test_combiner():
    """Chi ``fit_transform`` on ``feature`` yields bin 3 at index 451."""
    chi_combiner = Combiner()
    binned = chi_combiner.fit_transform(feature, target, method='chi')
    assert binned[451] == 3
def test_combiner_labels_with_empty():
    """With ``empty_separate`` the label at ``(2, 'D')`` is ``'4.nan'``."""
    fitted = Combiner().fit(df, 'target', n_bins=4, empty_separate=True)
    labelled = fitted.transform(df, labels=True)
    assert labelled.loc[2, 'D'] == '4.nan'
def test_combiner_empty_separate():
    """With ``empty_separate`` no non-missing value of 'D' is put in bin 4."""
    binned = Combiner().fit_transform(
        df, 'target', n_bins=4, empty_separate=True,
    )
    # rows where the raw 'D' value is present
    present = pd.notna(df['D'])
    assert (binned['D'][present] != 4).all()
def test_combiner_step():
    """Step binning of column 'A' into 4 bins exports 4.5 as second split."""
    step_combiner = Combiner()
    step_combiner = step_combiner.fit(df['A'], method='step', n_bins=4)
    rules = step_combiner.export()
    assert rules['A'][1] == 4.5
def test_combiner_export():
    """The first exported rule of column 'B' is a list after a chi fit."""
    fitted = Combiner().fit(df, target, method='chi', n_bins=4)
    first_rule_of_b = fitted.export()['B'][0]
    assert isinstance(first_rule_of_b, list)
def test_combiner_labels():
    """``transform(labels=True)`` labels ``(451, 'A')`` as ``'3.[3 ~ 4)'``."""
    fitted = Combiner().fit(df, target)
    labelled = fitted.transform(df, labels=True)
    assert labelled.loc[451, 'A'] == '3.[3 ~ 4)'
# Rows whose 'split' is Q1-Q3 form the training set, Q4 the test set; the
# 'split' column itself is dropped from both.
train = data[data['split'].isin(['Q1', 'Q2', 'Q3'])].drop('split', axis=1)
test = data[data['split'].isin(['Q4'])].drop('split', axis=1)

# Select features on the training set and keep the same columns for the test
# set; `drops` holds the rejected features under the 'iv' and 'corr' keys.
train_s, drops = select(train, target='loan_status', iv=0.005, corr=0.8, return_drop=True)
test_s = test[train_s.columns]
print('IV筛选不通过的特征为:\n', drops['iv'], '\n', 'corr筛选不通过的特征为:\n', drops['corr'])
print('处理完成,剩余{}特征'.format(train_s.shape[1]), '\n' * 2)

# ----------------------------------------------------------------------------------------------------------------------------------------------------
# Binning
print('卡方分箱中'.center(60, '—'))
comb = Combiner()
columns = train_s.columns


def combine(data, target, columns=None, exclude=None):
    """Fit, print and plot bins one column at a time (fine-grained binning).

    For every entry of ``columns`` that is not in ``exclude``, the module-level
    ``comb`` is fitted on that column plus the target with ``method='chi'``
    and ``min_samples=0.1``. The exported bins are printed and the labelled
    column is drawn with ``bin_plot``.

    Args:
        data: Frame holding the feature columns and the target column.
        target: Name of the target column.
        columns: Column names to process; any iterable accepted by
            ``pd.Index``. ``None`` means no columns.
        exclude: Column names to skip. ``None`` means nothing is skipped.
    """
    # None defaults avoid shared mutable default arguments; wrapping in
    # pd.Index lets plain lists work too (a list has no ``.isin``).
    candidates = pd.Index([] if columns is None else columns)
    skipped = [] if exclude is None else exclude

    for i in candidates[~candidates.isin(skipped)]:
        data_i = pd.concat([data[i], data[target]], axis=1)
        comb.fit(data_i, y=target, method='chi', min_samples=0.1)
        bins = comb.export()
        print(bins)
        data_c = comb.transform(data_i, labels=True)
        bin_plot(data_c, x=i, target=target)
        plt.show()


# Disabled call, kept for reference:
# combine(train_s, target='loan_status', columns=columns, exclude=['loan_status'])
# NOTE(review): everything up to the closing brace below is the tail of a
# nested dict literal whose opening lies above this excerpt — the entries map
# labels to numeric values; confirm the enclosing name against the full file.
        '[5 ~ 8)': 300,
        '[8 ~ inf)': 400,
        'nan': 500,
    },
    'B': {
        # keys are the comma-joined strings 'A,B,C,D' and 'E,F'
        ','.join(list('ABCD')): 200,
        ','.join(list('EF')): 400,
        'else': 500,
    },
    'C': {
        'A': 200,
        'B': 100,
    },
}

# Module-level fixtures: bin `df` into 5 bins, then WOE-transform the bins.
combiner = Combiner()
bins = combiner.fit_transform(df, target, n_bins=5)
woe_transer = WOETransformer()
woe = woe_transer.fit_transform(bins, target)

# create a score card
card = ScoreCard(
    combiner=combiner,
    transer=woe_transer,
)
card.fit(woe, target)

# Tolerance passed as the second positional argument of pytest.approx.
# NOTE(review): presumably interpreted as the relative tolerance — confirm.
FUZZ_THRESHOLD = 1e-4
TEST_SCORE = pytest.approx(453.58, FUZZ_THRESHOLD)
def test_combiner_select_dtypes():
    """With ``select_dtypes='number'`` the value at ``(451, 'B')`` is 'G'."""
    numeric_combiner = Combiner()
    result = numeric_combiner.fit_transform(df, target, select_dtypes='number')
    assert result.loc[451, 'B'] == 'G'
def test_combiner_exclude():
    """With ``exclude='B'`` the value at ``(451, 'B')`` is 'G'."""
    excluding_combiner = Combiner()
    result = excluding_combiner.fit_transform(df, target, exclude='B')
    assert result.loc[451, 'B'] == 'G'
def test_combiner_target_in_frame_kwargs():
    """Passing the target as ``y='target'`` exports 6 as second split of 'A'."""
    fitted = Combiner().fit(df, y='target', n_bins=4)
    rules = fitted.export()
    assert rules['A'][1] == 6