Exemple #1
0
def test_dataset4():
    ''' (2 points) compare a decision tree, bagging and a random forest on dataset4'''
    n = 400
    X, Y = RF.load_dataset()

    assert X.shape == (16, 400)
    assert Y.shape == (400,)

    # even-indexed instances are used for training, odd-indexed ones for testing
    X_train, Y_train = X[:, ::2], Y[::2]
    X_test, Y_test = X[:, 1::2], Y[1::2]

    # baseline: a single decision tree
    tree = DT().train(X_train, Y_train)
    Y_predict = DT.predict(tree, X_test)
    # the test half holds n/2 instances, hence the factor 2
    accuracy0 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a decision tree:', accuracy0)

    # bagging ensemble of 21 trees
    bagged_trees = Bag().train(X_train, Y_train, 21)
    Y_predict = Bag.predict(bagged_trees, X_test)
    accuracy1 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a bagging of 21 trees:', accuracy1)

    # random forest of 21 trees
    forest_trees = RF().train(X_train, Y_train, 21)
    Y_predict = RF.predict(forest_trees, X_test)
    accuracy2 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a random forest of 21 trees:', accuracy2)

    # both ensembles must be at least as good as the single tree, and the
    # forest may trail bagging by at most 0.05
    assert accuracy1 >= accuracy0
    assert accuracy2 >= accuracy0
    assert accuracy2 >= accuracy1 - .05
    def inference(x, T, A):
        '''
            Given an adaboost ensemble of decision trees and one data instance, infer the label of the instance.
            Every tree votes for a label and the weights of the trees voting for the same label are summed.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of the decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, whatever DT.most_common returns for the label -> total weight mapping.
        '''
        # np.append is kept on purpose: it turns every prediction into a numpy
        # scalar, which is the key type the weight table has always used
        lab = np.array([])
        for t in T:
            lab = np.append(lab, DT.inference(t, x))

        # total weight collected by each distinct predicted label
        d = dict()
        for i, label in enumerate(lab):
            if label in d:
                d[label] += A[i]
            else:
                d[label] = A[i]

        # NOTE(review): DT.most_common receives a dict here, not a list of labels;
        # presumably it picks the key with the largest value - confirm against DT
        y = DT.most_common(d)
        return y
 def build_tree(self, X,Y,D):
     '''
         Build a decision stump: a root node with at most one level of (leaf) children.
         Input:
             X: the feature matrix, a numpy array of shape p by n.
                Here n is the number data instances in the node, p is the number of attributes.
             Y: the class labels, a numpy array of length n.
             D: the weights of instances, a numpy float vector of length n
         Return:
             t: the root node of the decision stump.
     '''
     t = Node(X, Y)
     t.p = self.most_common(t.Y, D)
     # stop conditions 1 / 2: the root itself becomes a leaf
     if DT.stop1(t.Y) or DT.stop2(t.X):
         t.isleaf = True
         return t

     # attribute index and threshold used for the single split
     t.i, t.th = self.best_attribute(t.X, t.Y, D)

     # column indices of the instances on each side of the threshold
     row = X[t.i, :]
     below = [j for j, v in enumerate(row) if v < t.th]
     others = [j for j, v in enumerate(row) if not (v < t.th)]

     def make_leaf(cols):
         # leaf child holding the selected instances
         sub_Y = Y[cols]
         leaf = Node(X[:, cols], sub_Y, isleaf=True)
         # rescale the selected weights so that they sum to 1
         weights = D[cols]
         total = float(sum(weights))
         for k, raw in enumerate(D[cols]):
             weights[k] = float(raw) / total
         leaf.p = self.most_common(sub_Y, weights)
         return leaf

     t.C1 = make_leaf(below)
     t.C2 = make_leaf(others)
     return t
    def best_threshold(X,Y,D):
        '''
            Pick, among the cutting points of a continuous attribute, the threshold with the highest weighted information gain.
            Input:
                X: the attribute values, a numpy array of int/float values.
                Y: the labels, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain of that threshold, a float scalar
                   (-1 when the selected threshold is -inf).
        '''
        candidates = list(DT.cutting_points(X, Y))
        gains = []
        for cut in candidates:
            # binarise the attribute: 0 below the cut, 1 otherwise
            binarized = X.copy()
            for i, value in enumerate(X):
                binarized[i] = 0 if value < cut else 1
            gains.append(DS.information_gain(Y, binarized, D))

        # first candidate reaching the maximal gain
        g = max(gains)
        th = candidates[gains.index(g)]

        # a threshold of -inf is reported with a gain of -1
        if th == float('-inf'):
            g = -1

        return th, g
Exemple #5
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees (a numpy scalar).
                   The first label reaching the highest count wins a tie.
                   When T is empty the empty array of votes is returned unchanged.
        '''
        # one vote per tree
        votes = np.array([DT.inference(t, x) for t in T])

        # tally the votes per label
        count = {}
        for label in votes:
            count[label] = count.get(label, 0) + 1

        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only
        y = votes
        num = 0
        for k, v in count.items():
            if v > num:
                num = v
                y = k
        return y
Exemple #6
0
 def best_threshold(X, Y, D):
     '''
         Pick, among the cutting points of a continuous attribute, the threshold with the highest weighted information gain.
         Input:
             X: the attribute values, a numpy array of int/float values.
             Y: the labels, a numpy array of int/float/string values.
             D: the weights of instances, a numpy float vector of length n
         Output:
             th: the best threshold, a float scalar.
             g: the weighted information gain of that threshold, a float scalar.
             (-inf, -1) is returned when a TypeError is raised while scanning the cutting points.
     '''
     candidates = DT.cutting_points(X, Y)
     best_th, best_gain = -1, -1
     try:
         for cut in candidates:
             # "T" marks the values above the cut, "F" all the others
             side = np.array(["T" if value > cut else "F" for value in np.copy(X)])
             gain = DS.information_gain(Y, side, D)
             if gain > best_gain:
                 best_th, best_gain = cut, gain
     except TypeError:
         return -float('Inf'), -1
     return best_th, best_gain
Exemple #7
0
    def best_threshold(X, Y, D):
        '''
            Find the best threshold among all possible cutting points in the continuous attribute of X. The data instances are weighted.
            Input:
                X: the attribute values, a numpy array of int/float values.
                Y: the labels, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar (-inf when no cutting point is usable).
                g: the weighted information gain by using the best threshold, a float scalar
                   (-1 when no cutting point is usable).
        '''
        # defaults reported when there is nothing to scan
        g = -1
        th = float('-inf')

        cp = DT.cutting_points(X, Y)
        # anything that is not an array of cutting points means "no candidates"
        if isinstance(cp, np.ndarray):
            for i in cp:
                # boolean indicator of the values above the candidate
                # NOTE(review): numpy may collapse an all-False mask to a scalar;
                # confirm DS.information_gain tolerates that
                a = (np.ma.masked_where(X > i, X)).mask
                gain = DS.information_gain(Y, a, D)
                if gain > g:
                    g = gain
                    th = i

        return th, g
Exemple #8
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        # one vote per tree, tallied per label
        tally = {}
        for vote in [DT.inference(tree, x) for tree in T]:
            tally[vote] = tally.get(vote, 0) + 1

        # labels ordered by ascending vote count; the winner is the last one
        ranked = sorted(tally, key=tally.get)
        y = ranked[-1]
        return y
    def best_threshold(X, Y, D):
        '''
            Pick, among the cutting points of a continuous attribute, the threshold with the highest weighted information gain.
            Input:
                X: the attribute values, a numpy array of int/float values.
                Y: the labels, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain of that threshold, a float scalar.
                (-inf, -1) is returned when every cutting point equals -inf.
        '''
        candidates = DT.cutting_points(X, Y)

        # no real cutting point available
        if np.all(candidates == -np.inf):
            return -float('inf'), -1

        # gain of the split "value >= candidate" for every candidate
        gains = [DS.information_gain(Y, X >= cut, D)
                 for cut in np.nditer(candidates)]

        g = max(gains)
        th = candidates[np.argmax(gains)]
        return th, g
Exemple #10
0
    def load_dataset(filename='data3.csv'):
        '''
            Load a dataset through DT.load_dataset and convert the labels to integers.
            Input:
                filename: the filename of the dataset, a string (defaults to 'data3.csv').
            Output:
                X: the feature matrix, exactly as returned by DT.load_dataset.
                Y: the class labels returned by DT.load_dataset, cast to int.
        '''
        features, labels = DT.load_dataset(filename)
        X, Y = features, labels.astype(int)
        return X,Y
Exemple #11
0
    def train(self, X, Y, n_tree=11):
        '''
            Given a training set, train a bagging ensemble of decision trees.
            Each tree is trained on its own bootstrap sample of the training set.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the training set, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                   Each element can be int/float/string.
                n_tree: the number of trees in the ensemble
            Output:
                T: a list of the root of each tree, a list of length n_tree.
        '''
        T = []
        for _ in range(n_tree):
            # draw a fresh sample from the ORIGINAL data for every tree; X and Y
            # are deliberately not overwritten so the samples do not compound
            # NOTE(review): assumes Bag.bootstrap(X, Y) returns a resampled (X, Y) pair
            X_boot, Y_boot = Bag.bootstrap(X, Y)
            T.append(DT().train(X_boot, Y_boot))
        return T
Exemple #12
0
    def inference(T,x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees;
                   the first label reaching the highest count wins a tie.
        '''
        # one vote per tree
        votes = [DT.inference(t, x) for t in T]

        # number of votes per label
        tally = dict()
        for vote in votes:
            if vote in tally:
                tally[vote] += 1
            else:
                tally[vote] = 1

        # keep the label with the strictly largest count seen so far
        top = 0
        for label, n_votes in tally.items():
            if n_votes > top:
                top, y = n_votes, label
        return y
Exemple #13
0
    def inference(x, T, A):
        '''
            Given an adaboost ensemble of decision trees and one data instance, infer the label of the instance.
            Each tree votes for a label with its own weight; the label with the largest total weight wins.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, a scalar of int/float/string.
                   On a tie the label predicted by the earliest tree wins.
            Raise:
                ValueError: when there is no (tree, weight) pair to vote with.
        '''
        y_list = [DT.inference(t, x) for t in T]

        # total weight per label, kept in first-seen order so ties are deterministic
        totals = {}
        for label, alpha in zip(y_list, A):
            totals[label] = totals.get(label, 0) + alpha

        if not totals:
            raise ValueError('cannot infer a label from an empty ensemble')

        # no zero floor: a label must still be chosen when every total
        # weight is zero or negative
        best = None
        for label, total in totals.items():
            if best is None or total > best:
                best = total
                y = label
        return y
Exemple #14
0
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump: a root node with at most one level of (leaf) children.
            Input:
                X: the feature matrix, a numpy array of shape p by n.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y)
        t.p = DS.most_common(Y, D)

        # stop conditions 1 / 2: the root itself becomes a leaf
        if DT.stop1(Y) or DT.stop2(X):
            t.isleaf = True
            return t

        # attribute and threshold of the single split, then the two children
        t.i, t.th = self.best_attribute(X, Y, D)
        t.C1, t.C2 = self.split(t.X, t.Y, t.i, t.th)

        # route every instance weight to the side its attribute value falls on
        goes_left = [X[t.i, j] < t.th for j in range(len(D))]
        D1 = np.array([w for w, left in zip(D, goes_left) if left])
        D2 = np.array([w for w, left in zip(D, goes_left) if not left])

        # both children are leaves labelled with their weighted majority class
        for child, weights in ((t.C1, D1), (t.C2, D2)):
            child.p = DS.most_common(child.Y, weights)
        t.C1.isleaf = True
        t.C2.isleaf = True

        return t
Exemple #15
0
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump: a root node with at most one level of (leaf) children.
            Input:
                X: the feature matrix, a numpy array of shape p by n.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y)
        t.p = DS.most_common(t.Y, D)

        # unless both stop conditions are explicitly False the root is a leaf
        if not (DT.stop1(t.Y) == False and DT.stop2(t.X) == False):
            t.isleaf = True
            return t

        # attribute and threshold of the single split, then the two children
        t.i, t.th = DS().best_attribute(t.X, t.Y, D)
        t.C1, t.C2 = DT.split(t.X, t.Y, t.i, t.th)

        # weights of the instances on each side of the threshold
        values = X[t.i]
        left_weights = D[np.nonzero(values < t.th)]
        right_weights = D[np.nonzero(values >= t.th)]

        # both children are leaves labelled with their weighted majority class
        t.C1.p = DS.most_common(t.C1.Y, left_weights)
        t.C2.p = DS.most_common(t.C2.Y, right_weights)
        t.C1.isleaf = True
        t.C2.isleaf = True
        return t
Exemple #16
0
    def train(self, X, Y, n_tree=11):
        '''
            Train a bagging ensemble: every tree is fitted on its own bootstrap sample of the training set.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Here n is the number data instances in the training set, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                n_tree: the number of trees in the ensemble
            Output:
                T: a list of the root of each tree, a list of length n_tree.
        '''
        T = []
        for _ in range(n_tree):
            # resample first, then fit a fresh tree on the sample
            sample_X, sample_Y = Bag.bootstrap(X, Y)
            learner = DT()
            root = learner.train(sample_X, sample_Y)
            T.append(root)
        return T
Exemple #17
0
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump: a root node with at most one level of (leaf) children.
            Input:
                X: the feature matrix, a numpy array of shape p by n.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y, isleaf=False)
        t.p = DS.most_common(Y, D)

        # stop conditions 1 / 2: the root itself becomes a leaf
        if DT.stop1(Y) or DT.stop2(X):
            t.isleaf = True
            return t

        # attribute and threshold of the single split
        t.i, t.th = self.best_attribute(X, Y, D)

        # boolean selectors of the instances on each side of the threshold
        values = X[t.i, :]
        left = values < t.th
        right = values >= t.th

        # each child is a leaf labelled with the weighted majority class of its side
        left_label = DS.most_common(Y[left], D[left])
        t.C1 = Node(X[:, left], Y[left], isleaf=True, p=left_label)
        right_label = DS.most_common(Y[right], D[right])
        t.C2 = Node(X[:, right], Y[right], isleaf=True, p=right_label)
        return t
Exemple #18
0
    def inference(x, T, A):
        '''
            Infer the label of one instance with an adaboost ensemble of decision stumps.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, the result of DS.most_common over the stump
                   predictions weighted by A.
        '''
        # prediction of every stump, in ensemble order
        votes = [DT.inference(stump, x) for stump in T]
        return DS.most_common(votes, A)
Exemple #19
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        # one vote per tree
        votes = [DT.inference(tree, x) for tree in T]
        # among the distinct labels, keep the one occurring most often
        return max(set(votes), key=votes.count)
Exemple #20
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees;
                   on a tie the label voted earliest wins.
        '''
        # one vote per tree
        votes = [DT.inference(tree, x) for tree in T]
        # the vote whose label occurs most often in the list
        return max(votes, key=votes.count)
Exemple #21
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance by majority vote.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees (a numpy scalar).
        '''
        # one vote per tree
        votes = [DT.inference(tree, x) for tree in T]
        # distinct labels together with the number of votes each one received
        labels, counts = np.unique(votes, return_counts=True)
        return labels[np.argmax(counts)]
Exemple #22
0
    def best_threshold(X, Y, D):
        '''
            Find the best threshold among all possible cutting points in the continuous attribute of X. The data instances are weighted.
            Input:
                X: the attribute values, a numpy array of int/float values.
                Y: the labels, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain by using the best threshold, a float scalar.
                (-inf, -1) is returned when DT.cutting_points yields a bare float
                instead of a collection of cutting points.
        '''
        cp = DT.cutting_points(X, Y)

        # isinstance also accepts float subclasses such as numpy.float64
        if isinstance(cp, float):
            return -float('Inf'), -1

        th = g = -1
        for c in cp:
            # 'L' marks the values larger than the candidate, 'S' all the others
            helper = np.asarray(['L' if x > c else 'S' for x in X])
            gg = DS.information_gain(Y, helper, D)
            if gg > g:
                th = c
                g = gg

        return th, g
Exemple #23
0
    def best_threshold(X,Y,D):
        '''
            Pick, among the cutting points of a continuous attribute, the threshold with the highest weighted information gain.
            Input:
                X: the attribute values, a numpy array of int/float values.
                Y: the labels, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n

            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain of that threshold, a float scalar.
                (-inf, -1) is returned when every cutting point equals -inf.
        '''
        candidates = DT.cutting_points(X, Y)

        # no real cutting point available
        if np.all(candidates == -np.inf):
            return -float('inf'), -1

        # gain of the split "value >= candidate" for every candidate
        gains = []
        for cut in candidates:
            gains.append(DS.information_gain(Y, X >= cut, D))

        g = max(gains)
        th = candidates[np.argmax(gains)]
        return th, g
Exemple #24
0
    def step(X, Y, D):
        '''
            Run one boosting round: fit a stump, weigh it, and re-weight the instances.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n. Each element can be int/float/string.
                D: the current weights of instances, a numpy float vector of length n
            Output:
                t: the root node of the decision stump trained in this step
                a: (alpha) the weight of the decision stump, as given by AB.compute_alpha.
                D: the new weights of instances, as given by AB.update_D.
        '''
        # fit a stump on the weighted data and predict the training set with it
        stump = DS().build_tree(X, Y, D)
        predicted = DT.predict(stump, X)

        # the weighted error of the stump determines its vote weight
        error = AB.weighted_error_rate(Y, predicted, D)
        alpha = AB.compute_alpha(error)

        # instance weights for the next round
        new_D = AB.update_D(D, alpha, Y, predicted)
        return stump, alpha, new_D