def sk_learn(data="oldGames.arff", min_split=300, min_leaf=15):
    """Run 10-fold cross-validation with sklearn's DecisionTreeClassifier.

    Loads an ARFF dataset, shuffles it, averages the test accuracy over the
    folds, and renders the final fold's tree to "old_games" via graphviz.

    Args:
        data: path to the ARFF file (last column is the class label).
        min_split: forwarded as ``min_samples_split`` to the classifier.
        min_leaf: forwarded as ``min_samples_leaf`` to the classifier.

    Returns:
        Mean test-set accuracy across the folds.
    """
    folds = 10
    mat = Arff(data, label_count=1)
    # np.random.seed(35)
    np.random.shuffle(mat.data)
    splits = np.array_split(mat.data, folds)
    total_acc = 0
    clf = None
    for f in range(folds):
        # Training set = every split except fold f (fold f is held out).
        train = np.concatenate([splits[o] for o in range(folds) if o != f])
        train_x = train[:, 0:-1]
        train_y = train[:, -1].reshape(-1, 1)
        # Fix: pass min_split/min_leaf through — they were accepted by the
        # signature but previously ignored (the kwargs were commented out).
        clf = tree.DecisionTreeClassifier(min_samples_split=min_split,
                                          min_samples_leaf=min_leaf)
        clf = clf.fit(train_x, train_y)
        test_x = splits[f][:, 0:-1]
        test_y = splits[f][:, -1].reshape(-1, 1)
        total_acc += score(clf.predict(test_x), test_y)
    total_acc = total_acc / folds
    print("Accuracy = [{:.4f}]".format(total_acc))
    classes = [
        "Overwhelmingly_Positive", "Very_Positive", "Positive",
        "Mostly_Positive", "Mixed", "Mostly_Negative", "Negative",
        "Very_Negative", "Overwhelmingly_Negative"
    ]
    # Export the tree fitted on the final fold's training data.
    dot_data = tree.export_graphviz(clf,
                                    out_file=None,
                                    feature_names=mat.get_attr_names()[:-1],
                                    class_names=classes,
                                    filled=True,
                                    rounded=True)
    graph = graphviz.Source(dot_data)
    graph.render("old_games")
    return total_acc
def all_lenses():
    """Train a DTClassifier on lenses.arff and score it on all_lenses.arff.

    Prints the training-set accuracy followed by the accuracy on the full
    lens dataset.
    """
    print("---------all-lenses----------")
    train_arff = Arff("./lenses.arff", label_count=1)
    test_arff = Arff("./all_lenses.arff", label_count=1)
    # Features are every column but the last; the last column is the label.
    x_train = train_arff.data[:, :-1]
    y_train = train_arff.data[:, -1].reshape(-1, 1)
    x_test = test_arff.data[:, :-1]
    y_test = test_arff.data[:, -1].reshape(-1, 1)
    model = DTClassifier(features=train_arff.get_attr_names())
    model.fit(x_train, y_train)
    test_acc = model.score(x_test, y_test)
    print("Train Accuracy=[{:.2f}]".format(model.score(x_train, y_train)))
    print("Accuracy=[{:.2f}]".format(test_acc))
def voting():
    """10-fold CV of DTClassifier on voting.arff with a held-out test set.

    Splits off 25% of the data as a final test set, runs 10-fold
    cross-validation on the remainder, writes per-fold scores to voting.csv,
    and exports the best-scoring fold's tree to the file "voting_tree".
    """
    print("----------------voting------------------")
    mat = Arff("./voting.arff", label_count=1)
    splits = 10
    kfolder = KFold(n_splits=splits)
    # scores[0] = validation accuracy per fold, scores[1] = train accuracy.
    scores = [[], []]
    data, tData, labels, tLabels = train_test_split(
        mat.data[:, :-1],
        mat.data[:, -1].reshape(-1, 1),
        test_size=.25)
    best_tree = (0, None)  # (best validation accuracy, fitted classifier)
    for train, validate in kfolder.split(data, labels):
        dtree = DTClassifier(features=mat.get_attr_names())
        dtree.fit(data[train], labels[train])
        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)
    # Append the per-row averages as a final column of the CSV.
    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '
    np.savetxt("voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
    # Fix: export the best CV tree. Previously this wrote `dtree`, which is
    # whichever tree the *last* fold happened to produce, even though the
    # reported accuracy just above is the best tree's. Also use a context
    # manager so the file handle is closed on error.
    with open("voting_tree", "w") as f:
        f.write(best_tree[1].graph(
            class_translator=lambda x: mat.attr_value(-1, x)))
def nan_lenses():
    """Fit a DTClassifier on nan_lenses.arff and report hold-out accuracy.

    Uses a 75/25 train/test split, prints the learned tree, then prints the
    accuracy on the held-out 25%.
    """
    print("----------------nan_lenses------------------")
    arff = Arff("./nan_lenses.arff", label_count=1)
    # 25% of the rows are held out for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        arff.data[:, :-1],
        arff.data[:, -1].reshape(-1, 1),
        test_size=.25)
    model = DTClassifier(features=arff.get_attr_names())
    model.fit(x_train, y_train)
    print(model.tree)
    accuracy = model.score(x_test, y_test)
    print("Accuracy=[{:.2f}]".format(accuracy))
def evaluation():
    """Train a DTClassifier on zoo.arff and evaluate it on all_zoo.arff.

    Prints train and test accuracy and writes the test-set predictions to
    predicted_zoo.csv.
    """
    print("----------------evaluation---------------")
    train_arff = Arff("./zoo.arff", label_count=1)
    test_arff = Arff("./all_zoo.arff", label_count=1)
    # Last column of each dataset is the class label.
    x_train = train_arff.data[:, :-1]
    y_train = train_arff.data[:, -1].reshape(-1, 1)
    x_test = test_arff.data[:, :-1]
    y_test = test_arff.data[:, -1].reshape(-1, 1)
    model = DTClassifier(features=train_arff.get_attr_names())
    model.fit(x_train, y_train)
    print("Train Accuracy=[{:.2f}]".format(model.score(x_train, y_train)))
    # Persist the raw predictions for inspection / grading.
    np.savetxt('predicted_zoo.csv',
               model.predict(x_test),
               delimiter=',',
               header="predicted")
    accuracy = model.score(x_test, y_test)
    print("Accuracy=[{:.2f}]".format(accuracy))
import pydotplus
from sklearn import tree
import collections
from graphviz import Source
from IPython.display import Image

# Top-level experiment script: load the tic-tac-toe dataset and build a
# custom DTClassifier. NOTE(review): the 10-fold CV section below is
# commented out and appears truncated at the end of this chunk.
mat = Arff("datasets/tictactoe.arff")
counts = []  ## this is so you know how many types for each column
for i in range(mat.data.shape[1]):
    counts += [mat.unique_value_count(i)]
# Split features (all columns but the last) from the label column.
data = mat.data[:,0:-1]
labels = mat.data[:,-1].reshape(-1,1)
labelNames = mat.get_attr_names() #[char for char in string.ascii_lowercase[:data.shape[1]]]
#del labelNames[-1]
# NOTE(review): this DTClassifier call uses a different signature
# (counts, labelNames, shuffle=True) than the keyword form used elsewhere
# in this file — confirm which constructor the project's class defines.
DTSClass = DTClassifier(counts, labelNames, shuffle=True)
##########10 fold CV#########################
# # #split into 10 groups
# sData = np.array_split(data, 10, 0)
# sLabels = np.array_split(labels, 10, 0)
# accs = []
# # clf = tree.DecisionTreeClassifier(min_impurity_decrease=2)
# for i in range(1):
#     #print(i, inputs[i])
#     vData = np.copy(sData[i])