Beispiel #1
0
    br_encoding_dict[col] = br_encoding['bad_rate']
    num_features.append(col + '_br_encoding')

# Persist br_encoding_dict as a pickle under folderOfData. The context
# manager guarantees the handle is closed even if pickle.dump raises,
# which the previous explicit open()/close() pair did not.
with open(folderOfData + 'br_encoding_dict.pkl', 'wb+') as file2:
    pickle.dump(br_encoding_dict, file2)

# (iii)对连续型变量进行分箱,包括(ii)中的变量
continous_merged_dict = {}
for col in num_features:
    print("{} is in processing".format(col))
    if -1 not in set(trainData[col]):  # -1会当成特殊值处理。如果没有-1,则所有取值都参与分箱
        max_interval = 5  # 分箱后的最多的箱数
        cutOff = scorecard_function.ChiMerge(trainData,
                                             col,
                                             'y',
                                             max_interval=max_interval,
                                             special_attribute=[],
                                             minBinPcnt=0)
        trainData[col + '_Bin'] = trainData[col].map(
            lambda x: scorecard_function.AssignBin(
                x, cutOff, special_attribute=[]))
        monotone = scorecard_function.BadRateMonotone(trainData, col + '_Bin',
                                                      'y')  # 检验分箱后的单调性是否满足
        while (not monotone):
            # 检验分箱后的单调性是否满足。如果不满足,则缩减分箱的个数。
            max_interval -= 1
            cutOff = scorecard_function.ChiMerge(trainData,
                                                 col,
                                                 'y',
                                                 max_interval=max_interval,
                                                 special_attribute=[],
Beispiel #2
0
    # Hard-coded numeric sequences, one per feature name (x2 .. x10).
    # Presumably these are manual bin boundaries for the corresponding
    # columns -- their use is outside this excerpt, TODO confirm.
    x2_bin = range(25,105,5)  # 25, 30, ..., 100
    x3_bin = range(1,11)  # 1 .. 10
    x7_bin = range(1,11)  # 1 .. 10
    x9_bin = range(1,11)  # 1 .. 10
    x4_bin = list(np.linspace(0.1,0.9,num=9))  # 0.1, 0.2, ..., 0.9
    # 1000 .. 13000 in steps of 1000, followed by 20000, 30000, 40000.
    # NOTE(review): concatenating range() results with `+` only works where
    # range() returns a list (Python 2); it raises TypeError on Python 3.
    x5_bin = range(1000,14000,1000)+range(20000,50000,10000)
    x6_bin = range(1,20)  # 1 .. 19
    x8_bin = range(1,10)  # 1 .. 9
    x10_bin = range(1,5)  # 1 .. 4


    ## Chi-square binning
    # For every column of X, call scorecard_function.ChiMerge on `train`
    # against target column 'y' (max_interval=10, no special attributes,
    # minBinPcnt=0) and print the column name followed by the result.
    # NOTE(review): each result is stored by writing into the dict returned
    # by locals() under the key "<column>_chi". The Python docs state that
    # this dict should not be modified; the value is only read back here via
    # locals()[...] again, so confirm no later code expects real local
    # variables (e.g. x2_chi) to exist.
    
    print '卡方分箱结果为:'
    for x in X.columns:
        locals()[x+'_chi'] = scorecard_function.ChiMerge(train, x, 'y', max_interval=10, special_attribute=[], minBinPcnt=0) 
        print x , locals()[x+'_chi']   

    ##决策树分箱
    from sklearn import tree
    def decession_tree_bin(X,y,bin):
        clf = tree.DecisionTreeClassifier(criterion = 'entropy',max_leaf_nodes= bin,min_samples_leaf = 0.05).fit(X,y)
        n_nodes = clf.tree_.node_count
        children_left = clf.tree_.children_left
        children_right = clf.tree_.children_right
        threshold = clf.tree_.threshold
        boundary = []
        for i in range(n_nodes):
            if children_left[i]!=children_right[i]:
                boundary.append(threshold[i])
        sort_boundary = sorted(boundary)