:param count: DataFrame holding the negative/positive sample counts for each value of the variable being binned
        :param t_num_0: divisor applied to the column-0 count sum (presumably the overall class-0 sample total -- confirm against the caller)
        :param t_num_1: divisor applied to the column-1 count sum (presumably the overall class-1 sample total -- confirm against the caller)
        :return: the KS value of this bin
        Formula: KS_i = |sum_i / sum_T - (size_i - sum_i) / (size_T - sum_T)|
        """
        # For the left score interval, compute the cumulative good-account share of all
        # good accounts (good %) and the cumulative bad-account share of all bad
        # accounts (bad %).
        # NOTE(review): when a column sum is 0 the ratio falls back to 1 rather than 0,
        # and the guard does not protect against t_num_0 / t_num_1 being 0 -- confirm
        # that both are intended.
        good_left = count[1].sum() / t_num_1 if count[1].sum() != 0 else 1
        bad_left = count[0].sum() / t_num_0 if count[0].sum() != 0 else 1
        # The KS value is the absolute gap between the two ratios.
        return abs(good_left - bad_left)


if __name__ == '__main__':
    # Manual demo: load the example dataset and preview its first rows.
    e_data = ExampleData()
    df = e_data.get_iris2()
    print(df.head())
    # Build the binner; presumably 'y' is the target column and 'ks' selects the
    # KS-based criterion -- verify against BestDistinguishBinning.__init__.
    me_bin = BestDistinguishBinning('y', 'ks')
    me_bin.max_bin = 4
    # count = pd.crosstab(df['sepal_width'], df['y'])
    # print(MinEntropyBinning.calc_entropy(count))
    # Print the result of calc() for the 'sepal_width' column; the label string
    # translates to "set of split points".
    print('切分点集合', me_bin.calc(df, 'sepal_width'))
# @Author : HD import sys from io import StringIO from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import SGDClassifier from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import GradientBoostingRegressor from utils.example_data import ExampleData if __name__ == '__main__': """ 截取,模型训练的中间过程。 """ edd = ExampleData() df = edd.get_barest_cancer_data() cols = df.columns old_stdout = sys.stdout sys.stdout = mystdout = StringIO() clf = GradientBoostingRegressor(verbose=1) clf.fit(df[cols[:-2]], df[cols[-1]]) sys.stdout = old_stdout loss_history = mystdout.getvalue() print(type(loss_history)) # for i in loss_history.split('\n'): # print(i, len(i)) print([{ 'epoch': j[7:10].strip(' '),