Example 1
    def train(self, data, labels):
        self._data = data
        self._labels = labels
        numAttributes = len(data[0])  # should be same across all data points

        pAttr = list(
            range(numAttributes))  # list of attributes to possibly split on
        rlist = list(range(
            len(data)))  # initially all points are remaining in the data set
        self._root = Tree.Tree()
        self._createTree(self._root, pAttr, rlist)
        # no longer need to remember the training data
        self._data = []
        self._labels = []
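
The recursion above builds the model out of `Tree.Tree` node objects; the node class itself is not shown in these snippets. A minimal sketch of what it might look like, reconstructed purely from the fields the code assigns (`attr`, `vals`, `subTrees`, `final_label`, `chooseBest`) and not taken from the repository:

class Tree:
    """Hypothetical reconstruction of the Tree.Tree node class."""

    def __init__(self):
        self.attr = None         # index of the attribute this node splits on
        self.vals = []           # attribute values, one per subtree
        self.subTrees = []       # child nodes, aligned with self.vals
        self.final_label = None  # classification, set on leaf nodes

    def chooseBest(self, label_count):
        # fall back to the most frequent label in the subset
        self.final_label = max(label_count, key=label_count.get)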
Example 2
    def _createTree(self, tree, pAttr, rlist):
        # leaf case: every member of the subset has the same label
        first_label = self._labels[rlist[0]]
        if all(self._labels[i] == first_label for i in rlist):
            tree.final_label = first_label
            return

        # when there are no attributes left to split on
        if len(pAttr) == 0:
            tree.chooseBest(self._getLabelCount(rlist))
            return

        # information gain for each candidate attribute
        gains = [self._gain(a, rlist) for a in pAttr]

        # index of the attribute with the highest gain
        maxGain = max(range(len(gains)), key=lambda i: gains[i])

        # if all gains are 0 stop branching and use the most popular label
        # (in some data sets there may be duplicate vectors with different classifications)
        if gains[maxGain] == 0:
            tree.chooseBest(self._getLabelCount(rlist))
            return

        tree.attr = pAttr[maxGain]  # attribute to split on
        del pAttr[maxGain]  # remove attribute we're using from list

        vals_dist = self._valDistribution(tree.attr, rlist)

        # possible (remaining) vals for this attribute to take on
        tree.vals = list(vals_dist)
        tree.subTrees = [Tree.Tree() for i in range(len(tree.vals))]

        # iterate over each value to branch off of
        # use deep copies of pAttr!
        for i in range(len(tree.vals)):
            # recursively create tree
            self._createTree(tree.subTrees[i], deepcopy(pAttr),
                             vals_dist[tree.vals[i]])
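
`_createTree` relies on three helpers that these snippets never show: `_gain`, `_valDistribution`, and `_getLabelCount`. Below is one plausible, entropy-based sketch of them, written as standard ID3 information gain over index lists; it is an assumption consistent with how the methods are called above, not the repository's actual code (the functions would live in the same class as `train` and `_createTree`):

import math
from collections import Counter, defaultdict

def _entropy(self, rlist):
    # Shannon entropy of the labels in the given subset
    total = len(rlist)
    counts = Counter(self._labels[i] for i in rlist)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

def _gain(self, attr, rlist):
    # information gain = entropy before the split minus the
    # size-weighted entropy of the subsets after the split
    split = self._valDistribution(attr, rlist)
    after = sum(len(sub) / len(rlist) * self._entropy(sub)
                for sub in split.values())
    return self._entropy(rlist) - after

def _valDistribution(self, attr, rlist):
    # map each value of `attr` to the indices of the points taking it
    dist = defaultdict(list)
    for i in rlist:
        dist[self._data[i][attr]].append(i)
    return dict(dist)

def _getLabelCount(self, rlist):
    # label -> frequency within the subset
    return Counter(self._labels[i] for i in rlist)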
Example 3
X = iris.data
y = iris.target


def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        # raising a bare string is a TypeError in Python 3
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1

    return correct / len(pred_y)


t = Tree(max_depth=DEPTH, PFSRT=True, omega=1.5)

t.train(X, y)
t.printTree()

t.updatePFSRT()

acc = []
for i in range(NR_TREES):
    t.train(X, y)  # retrain on the same data, reusing the updated PFSRT scores
    t.updatePFSRT()
    acc.append((i, t._cur_accuracy))

acc = sorted(acc, key=lambda kv: kv[1])  # rank trees by accuracy, ascending
for tup in acc:
    print("Tree", tup[0], "acc:", tup[1])
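
Since the surrounding code already depends on scikit-learn, the hand-rolled `getAccuracy` repeated in these snippets is equivalent to `sklearn.metrics.accuracy_score`:

from sklearn.metrics import accuracy_score

# drop-in replacement for the getAccuracy helper above
acc = accuracy_score(actu_y, pred_y)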
Example 4
y = iris.target


def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        # raising a bare string is a TypeError in Python 3
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1

    return correct / len(pred_y)


st1 = time()
t = Tree(max_depth=DEPTH)
t.train(X, y)
en1 = time()

t.printTree()

print(y)
st1p = time()
y_pred = t.predict(X)
en1p = time()
#print(y_pred)

print(getAccuracy(y, y_pred))

clf = DecisionTreeClassifier(criterion='entropy', max_depth=DEPTH)
st2 = time()
Example 5
def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        # raising a bare string is a TypeError in Python 3
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1

    return correct / len(pred_y)


st1 = time()
t = []
# generate 10 random trees and look at the dist
for i in range(NR_TREES):
    t.append(Tree(max_depth=DEPTH, random_feat=True))
    t[i].train(X, y)
en1 = time()

# for i in range(0,NR_TREES):
#     print("Tree",i)
#     t[i].printTree()

print(y)
st1p = time()
y_pred = []
for i in range(NR_TREES):
    y_pred.append(t[i].predict(X))
en1p = time()

acc_tuples = []
Example 6
def do_experiments(X, y, depth, nr_rand_trees, data_label):
    # BASIC TREE
    print("\n------ BASIC TREE ------\n")
    print("Depth: ", depth)
    t = Tree(max_depth=depth)

    st1 = time()
    t.train(X, y)
    en1 = time()
    st1p = time()
    y_pred = t.predict(X)
    en1p = time()

    basic_acc = accuracy_score(y, y_pred)
    print(Get_ConfusionMatrix(y, y_pred))
    print("F-Score: ", Get_F_Score(y, y_pred))
    print("Accuracy: ", basic_acc)
    print("Time to train:", en1 - st1)
    print("Time to test:", en1p - st1p)

    # 10-fold cross-validation
    cv_arr = cross_validate(t, X, y, cv=10)
    print("Accuracy after 10-fold CV:",
          float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")

    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y, t.getClassProb(X), "ID3", label_text=data_label)
    # BASIC TREE END
    # RANDOM TREES
    print("\n------ RANDOM TREE ------\n")
    print("Depth: ", depth)
    print("Nr trees: ", nr_rand_trees)
    t2 = Tree(max_depth=depth, random_feat=True)

    t2_max = None
    y2_pred = None
    acc_max = 0

    iterations_taken = nr_rand_trees
    acc_list = []
    max_accs = []

    st2 = time()
    for i in range(nr_rand_trees):
        t2.train(X, y)
        y2_pred = t2.predict(X)
        acc = accuracy_score(y, y2_pred)
        acc_list.append(acc)
        if acc >= acc_max:
            acc_max = acc
            # t2 is retrained in place on every iteration, so store a copy of
            # the best tree (assumes `from copy import deepcopy`); a bare
            # reference would end up pointing at whichever tree trained last
            t2_max = deepcopy(t2)
        max_accs.append(acc_max)
        # stop early once a random tree matches the basic tree's accuracy
        if acc >= basic_acc:
            iterations_taken = i + 1
            break

    en2 = time()
    st2p = time()
    y2_pred = t2_max.predict(X)
    en2p = time()

    print(Get_ConfusionMatrix(y, y2_pred))
    print("F-Score: ", Get_F_Score(y, y2_pred))
    print("Accuracy: ", acc_max)
    print("Time to train:", en2 - st2)
    print("Iterations taken:", iterations_taken)
    print("Time to test:", en2p - st2p)

    # 10-fold cross-validation
    cv_arr = cross_validate(t2_max, X, y, cv=10)
    print("Accuracy after 10-fold CV:",
          float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")

    random_decision_tree_accuracy(acc_list, label_text=data_label)
    accuracyRiseForRandomTrees(max_accs, label_text=data_label)

    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y,
                           t2_max.getClassProb(X),
                           "Random Forest",
                           label_text=data_label)
    # RANDOM TREES END
    # LOOKAHEAD TREE
    print("\n------ LOOKAHEAD TREE ------\n")
    print("Depth: ", depth)
    t3 = Tree(max_depth=depth, lookahead=True)

    st3 = time()
    t3.train(X, y)
    en3 = time()
    print("yo")
    st3p = time()
    y3_pred = t3.predict(X)
    en3p = time()
    print("yu")

    lookahead_acc = accuracy_score(y, y3_pred)
    print(Get_ConfusionMatrix(y, y3_pred))
    print("F-Score: ", Get_F_Score(y, y3_pred))
    print("Accuracy: ", lookahead_acc)
    print("Time to train:", en3 - st3)
    print("Time to test:", en3p - st3p)

    # 5-fold cross-validation (note: cv=5 here, unlike the 10-fold runs above)
    cv_arr = cross_validate(t3, X, y, cv=5)
    print("Accuracy after 5-fold CV:",
          float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")

    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y,
                           t3.getClassProb(X),
                           "Lookahead DT",
                           label_text=data_label)
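
A hypothetical driver for `do_experiments`, assuming the iris loader used in the other snippets; the `depth` and `nr_rand_trees` values are illustrative, not taken from the source:

from sklearn.datasets import load_iris

iris = load_iris()
do_experiments(iris.data, iris.target, depth=5, nr_rand_trees=10,
               data_label="iris")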
Example 7
X = iris.data
y = iris.target

def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        # raising a bare string is a TypeError in Python 3
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1

    return correct / len(pred_y)

print(X)
st1 = time()
t = Tree(max_depth=DEPTH, lookahead=True)
t.train(X, y)
en1 = time()

t.printTree()

print(y)
st1p = time()
y_pred = t.predict(X)
en1p = time()
print(y_pred)

print(getAccuracy(y, y_pred))

clf = DecisionTreeClassifier(criterion='entropy', max_depth=DEPTH)
st2 = time()