def main():
    """Train a decision tree, print its evaluation score on the test split, and plot it."""
    # Split the data set into its training and test parts.
    train_features, test_features, train_targets, test_targets = process_data()

    # Build the decision tree from the training part.
    fitted_tree = tree_generate(
        train_features,
        train_targets,
        max_depth=4,
        mini_sample=10,
    )

    # Run inference on the test part and evaluate it.
    score = evaluate(test_features, test_targets, fitted_tree)
    print('准确率 = ', score)

    # Draw the trained tree so the result is easy to inspect.
    treePlotter.create_plot(fitted_tree)
def classify_lenses():
    """Build a decision tree from ``lenses.txt``, print it, and plot it.

    Every line of the file is read as one tab-separated sample (feature values
    and the class label).
    """
    with open("lenses.txt", "r") as f:
        # One line is one sample (features, class label); iterating the file
        # object avoids materialising an extra list via readlines().
        lenses = [inst.strip().split('\t') for inst in f]
    # Feature labels. The last label was misspelled "testRate"; the other
    # lenses loaders in this file use "tearRate".
    lenses_labels = ["age", "prescript", "astigmatic", "tearRate"]
    lenses_tree = create_tree(lenses, lenses_labels)
    # Parenthesised form: a bare `print x` statement is a SyntaxError on
    # Python 3, while print(x) with a single argument works on 2 and 3.
    print(lenses_tree)
    import treePlotter
    treePlotter.create_plot(lenses_tree)
def get_result(self):
    """
    Build the decision tree, plot it, and print the classification result.

    :return: None
    """
    training_rows = self.train_data
    # create_tree receives a slice copy rather than self.attributes itself.
    attribute_names = self.attributes[:]
    tree = self.create_tree(training_rows, attribute_names)

    print("decisionTree:\n", tree)
    treePlotter.create_plot(tree)

    print("result:\n", self.classify_all(tree))
# NOTE(review): the `def` line of this first function was missing from the
# reviewed text. Its parameters are taken from the docstring, but the name
# `store_tree` is an assumption (counterpart of `grab_tree`) -- confirm
# against the original file before merging.
def store_tree(input_tree, filename):
    """
    Store a decision tree with pickle.

    :param input_tree: the decision tree
    :param filename: name of the file to write
    :return: None
    """
    import pickle
    # pickle needs a binary-mode file; the context manager closes it even
    # when pickle.dump raises.
    with open(filename, 'wb') as fw:
        pickle.dump(input_tree, fw)


def grab_tree(filename):
    """
    Read a decision tree structure back from a pickle file.

    :param filename: name of the file to read
    :return: the unpickled content of the file
    """
    import pickle
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # load files that come from a trusted source.
    # The context manager closes the handle, which was previously leaked.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


if __name__ == '__main__':
    # Each line of lenses.txt is split on tabs into one sample.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr]
    lenses_labels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lenses_tree = create_tree(lenses, lenses_labels)
    treePlotter.create_plot(lenses_tree)
    print(lenses_tree)
def test_glass():
    """Build a decision tree from ``lenses.txt`` and plot it.

    Every line of the file is split on tabs into one sample.
    """
    # The context manager closes the file; the handle was previously leaked.
    with open('lenses.txt') as fr:
        lenses = [line.strip().split('\t') for line in fr]
    # The second label was misspelled 'precript'; the other lenses loaders
    # in this file use 'prescript'.
    lense_labels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lense_tree = trees.create_tree(lenses, lense_labels)
    treePlotter.create_plot(lense_tree)
def entroy(self, y):
    """Return the base-2 Shannon entropy of the labels in ``y``.

    :param y: label collection exposing ``shape[0]`` (e.g. a pandas Series
        or a NumPy array)
    :return: ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value in ``y``
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    p = pd.Series(y).value_counts() / y.shape[0]  # share of each class
    ent = np.sum(-p * np.log2(p))
    return ent


if __name__ == '__main__':
    data_path2 = 'watermelon2_0_Ch.txt'
    data = pd.read_table(data_path2, encoding='utf8', delimiter=',', index_col=0)

    # Sample numbers are written 1-based; shift them to 0-based positions
    # for iloc.
    train = [1, 2, 3, 6, 7, 10, 14, 15, 16, 17]
    train = [i - 1 for i in train]
    X = data.iloc[train, :6]
    y = data.iloc[train, 6]

    test = [4, 5, 8, 9, 11, 12, 13]
    test = [i - 1 for i in test]
    X_val = data.iloc[test, :6]
    y_val = data.iloc[test, 6]

    tree = DecisionTree('gini', 'pre_pruning')
    tree.fit(X, y, X_val, y_val)

    print("平均准确率为:", np.mean(tree.predict(X_val) == y_val))
    treePlotter.create_plot(tree.tree_)