Esempio n. 1
0
        :param count: DataFrame 待分箱变量各取值的正负样本数
        :param t_num_0:
        :param t_num_1:
        :return: 该分箱的ks值
        计算公式:KS_i = |sum_i / sum_T - (size_i - sum_i)/ (size_T - sum_T)|
        """

        # 计算左评分区间的累计好账户数占总好账户数比率(good %)和累计坏账户数占总坏账户数比率(bad %)。
        good_left = count[1].sum() / t_num_1 if count[1].sum() != 0 else 1
        bad_left = count[0].sum() / t_num_0 if count[0].sum() != 0 else 1

        return abs(good_left - bad_left)


if __name__ == '__main__':
    # Demo: bin the 'sepal_width' column of the example dataset against the
    # target column 'y' using the 'ks' criterion, then print the cut points.
    iris = ExampleData().get_iris2()
    print(iris.head())

    binner = BestDistinguishBinning('y', 'ks')
    binner.max_bin = 4  # override the binner's max_bin setting for this demo

    split_points = binner.calc(iris, 'sepal_width')
    print('切分点集合', split_points)
Esempio n. 2
0
# @Author  : HD
import sys
from io import StringIO

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from utils.example_data import ExampleData

if __name__ == '__main__':
    """
	截取,模型训练的中间过程。
	"""

    edd = ExampleData()

    df = edd.get_barest_cancer_data()
    cols = df.columns
    old_stdout = sys.stdout
    sys.stdout = mystdout = StringIO()
    clf = GradientBoostingRegressor(verbose=1)
    clf.fit(df[cols[:-2]], df[cols[-1]])
    sys.stdout = old_stdout
    loss_history = mystdout.getvalue()

    print(type(loss_history))
    # for i in loss_history.split('\n'):
    # 	print(i, len(i))
    print([{
        'epoch': j[7:10].strip(' '),