import numpy as np
import graphviz
from sklearn import tree

# Arff and the score() accuracy helper are defined elsewhere in this project


def sk_learn(data="oldGames.arff", min_split=300, min_leaf=15):
    folds = 10
    mat = Arff(data, label_count=1)

    counts = []  # number of unique values (categories) in each column
    for i in range(mat.data.shape[1]):
        counts += [mat.unique_value_count(i)]

    # np.random.seed(35)
    np.random.shuffle(mat.data)
    splits = np.array_split(mat.data, folds)

    Acc = 0
    # min_split = 300
    # print("Minsplit: {}".format(min_split))
    for f in range(folds):
        # print("Fold {}:".format(f))
        train = np.array([])
        for other in range(folds):
            if train.size == 0 and other != f:
                train = splits[other].copy()
            elif other != f:
                train = np.concatenate((train, splits[other]))

        data = train[:, 0:-1]
        labels = train[:, -1].reshape(-1, 1)

        clf = tree.DecisionTreeClassifier(min_samples_split=min_split,
                                          min_samples_leaf=min_leaf)
        clf = clf.fit(data, labels)
        pred = clf.predict(data)
        new_acc = score(pred, labels)
        # print("\tTrain Acc {}".format(new_acc))

        data2 = splits[f][:, 0:-1]
        labels2 = splits[f][:, -1].reshape(-1, 1)
        pred = clf.predict(data2)
        new_acc = score(pred, labels2)
        # print("\tTest Acc {}".format(new_acc))
        Acc += new_acc

    Acc = Acc / folds
    print("Accuracy = [{:.4f}]".format(Acc))

    classes = [
        "Overwhelmingly_Positive", "Very_Positive", "Positive",
        "Mostly_Positive", "Mixed", "Mostly_Negative", "Negative",
        "Very_Negative", "Overwhelmingly_Negative"
    ]
    dot_data = tree.export_graphviz(clf,
                                    out_file=None,
                                    feature_names=mat.get_attr_names()[:-1],
                                    class_names=classes,
                                    filled=True,
                                    rounded=True)  # max_depth=6,
    graph = graphviz.Source(dot_data)
    graph.render("old_games")

    return Acc
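

# A hedged aside, not part of the original example: the manual fold loop above can be
# reproduced with scikit-learn's own cross-validation helpers. Everything below is real
# sklearn API (KFold, cross_val_score); the Arff loader is the same assumption the rest
# of these examples make.
def sk_learn_cv(data="oldGames.arff", min_split=300, min_leaf=15, folds=10):
    from sklearn.model_selection import KFold, cross_val_score
    mat = Arff(data, label_count=1)
    X, y = mat.data[:, :-1], mat.data[:, -1]
    clf = tree.DecisionTreeClassifier(min_samples_split=min_split,
                                      min_samples_leaf=min_leaf)
    # shuffle=True mirrors the np.random.shuffle call in sk_learn above
    scores = cross_val_score(clf, X, y, cv=KFold(n_splits=folds, shuffle=True))
    print("Average CV accuracy = [{:.4f}]".format(scores.mean()))
    return scores.mean()
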
Example 2
def all_lenses():
    print("---------all-lenses----------")

    lens_data = Arff("./lenses.arff", label_count=1)
    all_lens_data = Arff("./all_lenses.arff", label_count=1)

    lens_train = lens_data.data[:, :-1]
    lens_label_train = lens_data.data[:, -1].reshape(-1, 1)
    lens_test = all_lens_data.data[:, :-1]
    lens_label_test = all_lens_data.data[:, -1].reshape(-1, 1)

    dtree = DTClassifier(features=lens_data.get_attr_names())
    dtree.fit(lens_train, lens_label_train)
    score = dtree.score(lens_test, lens_label_test)
    print("Train Accuracy=[{:.2f}]".format(
        dtree.score(lens_train, lens_label_train)))
    print("Accuracy=[{:.2f}]".format(score))
Example 3
def voting():
    print("----------------voting------------------")

    mat = Arff("./voting.arff", label_count=1)
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)

    scores = [[], []]

    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    mat.data[:, -1].reshape(-1, 1),
                                                    test_size=.25)
    best_tree = (0, None)
    for train, validate in kfolder.split(data, labels):
        # print(train, validate)
        dtree = DTClassifier(features=mat.get_attr_names())
        dtree.fit(data[train], labels[train])

        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)

    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '

    np.savetxt("voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
    f = open("voting_tree", "w")
    f.write(dtree.graph(class_translator=lambda x: mat.attr_value(-1, x)))
    f.close()
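
# Hedged aside, not part of the original example: if dtree.graph() returns DOT source
# (an assumption; the file written above is just plain text), the saved tree can be
# rendered the same way the sklearn tree is rendered in the first example.
def render_voting_tree(path="voting_tree"):
    import graphviz
    with open(path) as f:
        graphviz.Source(f.read()).render(path)  # produces path + ".pdf" by default
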
Example 4
def nan_lenses():
    print("----------------nan_lenses------------------")

    mat = Arff("./nan_lenses.arff", label_count=1)
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1].reshape(-1, 1)

    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    mat.data[:, -1].reshape(-1, 1),
                                                    test_size=.25)

    dtree = DTClassifier(features=mat.get_attr_names())
    dtree.fit(data, labels)
    print(dtree.tree)

    # results = dtree.predict(tData)
    # for r, t in zip(results, tLabels):
    #     print(r, t)

    score = dtree.score(tData, tLabels)
    print("Accuracy=[{:.2f}]".format(score))
Example 5
def evaluation():
    print("----------------evaluation---------------")

    zoo_data = Arff("./zoo.arff", label_count=1)
    all_zoo_data = Arff("./all_zoo.arff", label_count=1)

    zoo_train = zoo_data.data[:, :-1]
    zoo_label_train = zoo_data.data[:, -1].reshape(-1, 1)
    zoo_test = all_zoo_data.data[:, :-1]
    zoo_label_test = all_zoo_data.data[:, -1].reshape(-1, 1)

    dtree = DTClassifier(features=zoo_data.get_attr_names())
    dtree.fit(zoo_train, zoo_label_train)
    print("Train Accuracy=[{:.2f}]".format(
        dtree.score(zoo_train, zoo_label_train)))

    predicted = dtree.predict(zoo_test)
    np.savetxt('predicted_zoo.csv',
               predicted,
               delimiter=',',
               header="predicted")
    score = dtree.score(zoo_test, zoo_label_test)
    print("Accuracy=[{:.2f}]".format(score))
Example 6
import pydotplus
import collections
import numpy as np
from sklearn import tree
from graphviz import Source
from IPython.display import Image

# Arff and DTClassifier are defined elsewhere in this project


mat = Arff("datasets/tictactoe.arff")

counts = []  # number of unique values (categories) in each column
for i in range(mat.data.shape[1]):
    counts += [mat.unique_value_count(i)]

data = mat.data[:, 0:-1]
labels = mat.data[:, -1].reshape(-1, 1)

labelNames = mat.get_attr_names()  # [char for char in string.ascii_lowercase[:data.shape[1]]]
# del labelNames[-1]

DTSClass = DTClassifier(counts, labelNames, shuffle=True)

##########10 fold CV#########################
# # #split into 10 groups
# sData = np.array_split(data, 10, 0)
# sLabels = np.array_split(labels, 10, 0)
# accs = []

# # clf = tree.DecisionTreeClassifier(min_impurity_decrease=2)

# for i in range(1):
#        #print(i, inputs[i])
#        vData = np.copy(sData[i])