br_encoding_dict[col] = br_encoding['bad_rate'] num_features.append(col + '_br_encoding') file2 = open(folderOfData + 'br_encoding_dict.pkl', 'wb+') pickle.dump(br_encoding_dict, file2) file2.close() # (iii)对连续型变量进行分箱,包括(ii)中的变量 continous_merged_dict = {} for col in num_features: print("{} is in processing".format(col)) if -1 not in set(trainData[col]): # -1会当成特殊值处理。如果没有-1,则所有取值都参与分箱 max_interval = 5 # 分箱后的最多的箱数 cutOff = scorecard_function.ChiMerge(trainData, col, 'y', max_interval=max_interval, special_attribute=[], minBinPcnt=0) trainData[col + '_Bin'] = trainData[col].map( lambda x: scorecard_function.AssignBin( x, cutOff, special_attribute=[])) monotone = scorecard_function.BadRateMonotone(trainData, col + '_Bin', 'y') # 检验分箱后的单调性是否满足 while (not monotone): # 检验分箱后的单调性是否满足。如果不满足,则缩减分箱的个数。 max_interval -= 1 cutOff = scorecard_function.ChiMerge(trainData, col, 'y', max_interval=max_interval, special_attribute=[],
# Manually specified bin boundaries (presumably one list per feature column
# x2..x10 -- confirm against the data set). Every range is materialised as a
# list so the values are plain lists on Python 2 and Python 3 alike.
x2_bin = list(range(25, 105, 5))
x3_bin = list(range(1, 11))
x7_bin = list(range(1, 11))
x9_bin = list(range(1, 11))
x4_bin = list(np.linspace(0.1, 0.9, num=9))
# range objects cannot be concatenated with `+` on Python 3, so join lists.
x5_bin = list(range(1000, 14000, 1000)) + list(range(20000, 50000, 10000))
x6_bin = list(range(1, 20))
x8_bin = list(range(1, 10))
x10_bin = list(range(1, 5))

# Chi-square binning
print('卡方分箱结果为:')
for x in X.columns:
    chi_cutoffs = scorecard_function.ChiMerge(train, x, 'y', max_interval=10,
                                              special_attribute=[], minBinPcnt=0)
    # Publish the cut-offs under the dynamic name "<column>_chi".
    # NOTE(review): writing through locals() only creates real variables at
    # module scope -- confirm this code does not run inside a function.
    locals()[x + '_chi'] = chi_cutoffs
    # Single pre-formatted argument: runs on Python 2 and 3, same text for plain column names.
    print('{} {}'.format(x, chi_cutoffs))

# Decision-tree binning
from sklearn import tree


def decession_tree_bin(X, y, bin):
    """Fit an entropy-based decision tree on X/y and collect its split thresholds.

    X and y are passed straight to DecisionTreeClassifier.fit. `bin` is used
    as max_leaf_nodes (the name shadows the builtin but is part of the
    signature), and min_samples_leaf=0.05 is a float, which scikit-learn
    interprets as a fraction of the samples.
    """
    clf = tree.DecisionTreeClassifier(criterion='entropy',
                                      max_leaf_nodes=bin,
                                      min_samples_leaf=0.05).fit(X, y)

    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    threshold = clf.tree_.threshold

    # A node whose left and right child ids differ is a split node (leaves
    # carry the same sentinel id on both sides); keep only those thresholds.
    boundary = [threshold[i] for i in range(n_nodes)
                if children_left[i] != children_right[i]]
    sort_boundary = sorted(boundary)