def CCP_cross_validation(TreeSets, alpha_list, X_test, y_test, feature_names, class_names, sklearn_model): precision_list = [] progress_length = len(TreeSets) # print"------------------------------检查下这里------------------------------" # print"X_test,y_test=",X_test # print y_test for index, item in enumerate(TreeSets): Ti_precision = precision_compute(item, X_test, y_test, feature_names, class_names) print "T%d_precision=%f" % (index, Ti_precision) precision_list.append(Ti_precision) print "the T" + str(index) + " has been validated, " + str( progress_length - index - 1) + " Trees left, wait please....." pruned_precision = max(precision_list) index = precision_list.index(pruned_precision) print "index=", index best_alpha = alpha_list[index] Best_tree = TreeSets[index] dot_file = "./visualization/Best_tree.dot" svg_file = "./visualization/Best_tree.svg" #画一画树 best_sklearn_model = copy.deepcopy(sklearn_model) prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree) draw_file(best_sklearn_model, dot_file, svg_file, feature_names) return Best_tree, best_alpha, pruned_precision
def CCP_validation(TreeSets,alpha_list,X_test,y_test,feature_names,class_names,sklearn_model,b_SE): precision_list=[] progress_length=len(TreeSets) # print"------------------------------检查下这里------------------------------" # print"X_test,y_test=",X_test # print y_test for index,item in enumerate(TreeSets): Ti_precision=precision_compute(item,X_test,y_test,feature_names,class_names) print"T%d_precision=%f"%(index,Ti_precision) precision_list.append(Ti_precision) print"the T"+str(index)+" has been validated, "+str(progress_length-index-1)+" Trees left, wait please....." if b_SE==False: pruned_precision=max(precision_list) index=precision_list.index(pruned_precision) print"index=",index best_alpha=alpha_list[index] Best_tree=TreeSets[index] dot_file="./visualization/Best_tree_0SE.dot" svg_file="./visualization/Best_tree_0SE.svg" #画一画树 best_sklearn_model=copy.deepcopy(sklearn_model) prune_sklearn_model(best_sklearn_model.tree_,0,Best_tree) draw_file(best_sklearn_model,dot_file,svg_file,feature_names) return Best_tree,best_alpha,pruned_precision,precision_list[0] else:#使用1-SE rule error_rate_list=[1-item for item in precision_list] lowest_error_rate=min(error_rate_list) print"error_rate_list=",error_rate_list SE=sqrt(lowest_error_rate*(1-lowest_error_rate)/len(y_test)) print"SE=",SE criterion_1_SE=lowest_error_rate+SE index_error_rate=0 for index,item in enumerate(error_rate_list):#search from from the end ,because the error_rate_list is not monotory. if error_rate_list[len(error_rate_list)-1-index]<criterion_1_SE: index_error_rate=len(error_rate_list)-1-index break # if index_error_rate-1>=0: # index_error_rate=index_error_rate-1 # else: # pass#becasuse the list may only have one item. pruned_precision=precision_list[index_error_rate]#here's right,because the precision list is corresponding to the error_rate_list. 
best_alpha=alpha_list[index_error_rate] Best_tree=TreeSets[index_error_rate] dot_file="./visualization/Best_tree_1SE.dot" svg_file="./visualization/Best_tree_1SE.svg" #画一画树 best_sklearn_model=copy.deepcopy(sklearn_model) prune_sklearn_model(best_sklearn_model.tree_,0,Best_tree) draw_file(best_sklearn_model,dot_file,svg_file,feature_names) return Best_tree,best_alpha,pruned_precision,precision_list[0]
def ECP_1SE_validation(TreeSets, alpha_list, X_test, y_test, feature_names, sklearn_model, b_SE): mse_list = [] progress_length = len(TreeSets) # print"------------------------------检查下这里------------------------------" # print"X_test,y_test=",X_test # print y_test for index, item in enumerate(TreeSets): Ti_mse = mse_compute(item, X_test, y_test, feature_names) print "T%d_mse=%f" % (index, Ti_mse) mse_list.append(Ti_mse) print "the T" + str(index) + " has been validated, " + str( progress_length - index - 1) + " Trees left, wait please....." if b_SE == False: pruned_mse = min(mse_list) index = mse_list.index(pruned_mse) # print"index=",index #------------------代码①处(start)------------------- best_alpha = alpha_list[index] Best_tree = TreeSets[index] dot_file = "./visualization/Best_tree_0SE.dot" svg_file = "./visualization/Best_tree_0SE.svg" #画一画树 print "unpruned_mse=", mse_list[0] best_sklearn_model = copy.deepcopy(sklearn_model) prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree) draw_file(best_sklearn_model, dot_file, svg_file, feature_names) return Best_tree, best_alpha, pruned_mse, mse_list[0] #------------------代码①处(end)------------------- else: min_mse = min(mse_list) SE = sqrt(min_mse * (1 - min_mse) / len(y_test)) criterion_1_SE = min_mse + SE index_mse = 0 for index, item in enumerate(mse_list): if mse_list[ len(mse_list) - 1 - index] < criterion_1_SE: #the mse_list is not Monotonous,so search from the end. 
index_mse = len(mse_list) - 1 - index break pruned_mse = mse_list[index_mse] #------------------下面代码与上面①一致(start)------------------- best_alpha = alpha_list[index_mse] Best_tree = TreeSets[index_mse] dot_file = "./visualization/Best_tree_1SE.dot" svg_file = "./visualization/Best_tree_1SE.svg" #画一画树 print "unpruned_mse=", mse_list[0] best_sklearn_model = copy.deepcopy(sklearn_model) prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree) draw_file(best_sklearn_model, dot_file, svg_file, feature_names) return Best_tree, best_alpha, pruned_mse, mse_list[0]
def model_json(data_path, name_path, cart_max_depth): ########################################################## feature_names = get_Attribute(name_path) print "data_path=", data_path #------------------------------------------ x_list, y_list = read_data_for_split(data_path, n=0, label=1) #把数据和类别标签列分开。 print "x_list=", x_list print "y_list=", y_list #------------------------------------------ X_train, X_test, y_train, y_test = train_test_split(x_list, y_list, test_size=0.25, random_state=0) print "X_train=", X_train #分别初始化对特征值和目标值的标准化器 ss_X = StandardScaler() ss_y = StandardScaler() #训练数据都是数值型,所以要标准化处理 X_train = np.array(X_train) print "X_train=", X_train X_train = ss_X.fit_transform(np.array(X_train)) X_test = np.array(X_test) X_test = ss_X.transform(np.array(X_test)) y_train = np.array(y_train) y_test = np.array(y_test) #目标数据(房价预测值)也是数值型,所以也要标准化处理 #说明一下:fit_transform与transform都要求操作2D数据,而此时的y_train与y_test都是1D的,因此需要调用reshape(-1,1),例如:[1,2,3]变成[[1],[2],[3]] y_train = ss_y.fit_transform(y_train.reshape(-1, 1)) y_test = ss_y.transform(y_test.reshape(-1, 1)) # print X_train feature_list = get_Attribute(name_path) dtr = DecisionTreeRegressor(max_depth=cart_max_depth, criterion='mse', random_state=0) print "now training,wait please.........." dtr.fit(X_train, y_train) print "train finished" class_names = '' #因为是回归,所以不需要分类名 result = rules(dtr, feature_list, class_names) print "result=", result with open('structure.json', 'w') as f: f.write(json.dumps(result)) print "The json-style model has been stored in structure.json" print "now I'm drawing the CART Regression tree,wait please............" # print dir(data) dot_file = "./visualization/T0.dot" png_file = "./visualization/T0.svg" # draw_file(dtr,dot_file,png_file,X_train,feature_list) draw_file(dtr, dot_file, png_file, feature_list) print "CART tree has been drawn in " + png_file return dtr, result, X_train, y_train, X_test, y_test, feature_list
def model_gtmin_Tt(clf,model,feature_names,class_names,Tt_name):#T0->T1
    """One CCP pruning step for the classification tree: locate the weakest
    link (the node with minimum g(t)), prune there, and return the result.

    clf     -- sklearn classifier; a deep copy is pruned for visualization
    model   -- json-style tree (dict); deep-copied, the original is untouched
    Tt_name -- string suffix for the ./visualization/T<name>.* output files

    Returns (sklearn_model, T1, alpha):
      sklearn_model -- deep copy of clf pruned to mirror T1
      T1            -- deep copy of model pruned at the weakest-link index
      alpha         -- the minimum g(t), i.e. the next critical alpha
    """
    Tt=Tt_count(model,0)#|Tt| -- NOTE(review): Tt/Rt/RTt are computed but never used below
    Rt=Rt_compute(model)
    RTt=RTt_compute(model,0)
    # Collect g(t) for every candidate node plus the subtree each would prune away.
    gt_list=[]
    prune_parts=[]
    gt_with_tree(model,gt_list,prune_parts)
    # Weakest link: the candidate with the smallest g(t).
    alpha=min(gt_list)
    prune_gt_index=gt_list.index(alpha)
    prune_for_minimum_gt=prune_parts[prune_gt_index]# unused below -- kept for debugging
    #------------------------------
    T0=copy.deepcopy(model)# NOTE(review): T0 is also unused after this point
    T1=copy.deepcopy(model)#here T1 means Ti
    gt_list=[]#must be reset to empty before T1_create refills them
    prune_parts=[]#must be reset to empty before T1_create refills them
    T1_create(T1,gt_list,prune_parts,prune_gt_index)
    # Why prune_for_minimum_gt is not used: knowing WHICH subtree to cut does
    # not tell us WHERE it hangs in the tree. So T1_create rebuilds
    # prune_parts while walking T1 and performs the cut at the matching index.
    index=0#never change this value!!!
    # Mirror the pruning onto a deep copy of the sklearn model, then draw it.
    sklearn_model=copy.deepcopy(clf)
    prune_sklearn_model(sklearn_model.tree_,index,T1)
    dot_file="./visualization/T"+Tt_name+".dot"
    png_file="./visualization/T"+Tt_name+".svg"
    draw_file(sklearn_model,dot_file,png_file,feature_names)
    return sklearn_model,T1,alpha
def model_gtmin_Tt(dtr, model, feature_names, Tt_name):  #not finished modifying yet
    """Regression variant of one CCP pruning step: locate the minimum g(t),
    prune a deep copy of the json-style tree there, and mirror the cut in a
    deep copy of the sklearn regressor for drawing.

    NOTE(review): this re-defines model_gtmin_Tt with a different signature;
    if the classification variant is in the same module, whichever definition
    comes later shadows the other -- verify which one callers expect.

    Returns (sklearn_model, T1, alpha):
      sklearn_model -- deep copy of dtr pruned to mirror T1
      T1            -- deep copy of model pruned at the weakest-link index
      alpha         -- the minimum g(t), i.e. the next critical alpha
    """
    Tt = Tt_count(model, 0)  #|Tt| -- NOTE(review): Tt/Rt/RTt computed but never used below
    Rt = Rt_compute(model)
    RTt = RTt_compute(model, 0)
    # Collect g(t) for every candidate node plus the subtree each would prune away.
    gt_list = []
    prune_parts = []
    gt_with_tree(model, gt_list, prune_parts)
    print "gt_list=", gt_list
    print "prune_parts=", prune_parts
    print "len(prune_parts)=", len(prune_parts)
    print "model=", model
    # Weakest link: the candidate with the smallest g(t).
    alpha = min(gt_list)
    prune_gt_index = gt_list.index(alpha)
    prune_for_minimum_gt = prune_parts[prune_gt_index]  # unused below -- kept for debugging
    #------------------------------
    T0 = copy.deepcopy(model)  # NOTE(review): T0 is also unused after this point
    T1 = copy.deepcopy(model)  #here T1 means Ti
    gt_list = []  #must be reset to empty before T1_create refills them
    pruned_parts = []  #must be reset to empty before T1_create refills it
    T1_create(T1, gt_list, pruned_parts, prune_gt_index)
    print "pruned_parts=", pruned_parts
    index = 0  #never change this value!!!
    # Mirror the pruning onto a deep copy of the sklearn model, then draw it.
    sklearn_model = copy.deepcopy(dtr)
    prune_sklearn_model(sklearn_model.tree_, index, T1)
    dot_file = "./visualization/T" + Tt_name + ".dot"
    png_file = "./visualization/T" + Tt_name + ".svg"
    draw_file(sklearn_model, dot_file, png_file, feature_names)
    return sklearn_model, T1, alpha