コード例 #1
0
ファイル: test4.py プロジェクト: msgonsalves/MLWH2
def test_dataset4():
    '''
        (2 points) test dataset4
        Compare a single decision tree, a bagging ensemble and a random forest
        on the dataset returned by RF.load_dataset().
        Even-indexed instances are used for training, odd-indexed ones for testing.
    '''
    n = 400
    X, Y = RF.load_dataset()

    assert X.shape == (16,400)
    assert Y.shape == (400,)

    # even columns -> training half, odd columns -> test half
    X_train, Y_train = X[:,::2], Y[::2]
    X_test, Y_test = X[:,1::2], Y[1::2]

    # baseline: one decision tree
    tree = DT().train(X_train, Y_train)
    Y_predict = DT.predict(tree, X_test)
    # the test half holds n/2 instances, hence the factor 2
    accuracy0 = sum(Y_test == Y_predict)/float(n)*2.
    print('test accuracy of a decision tree:', accuracy0)

    # bagging ensemble of 21 trees
    bagged_trees = Bag().train(X_train, Y_train, 21)
    Y_predict = Bag.predict(bagged_trees, X_test)
    accuracy1 = sum(Y_test == Y_predict)/float(n)*2.
    print('test accuracy of a bagging of 21 trees:', accuracy1)

    # random forest of 21 trees
    forest = RF().train(X_train, Y_train, 21)
    Y_predict = RF.predict(forest, X_test)
    accuracy2 = sum(Y_test == Y_predict)/float(n)*2.
    print('test accuracy of a random forest of 21 trees:', accuracy2)

    # both ensembles must match or beat the single tree,
    # and the forest may trail bagging by at most 0.05
    assert accuracy1 >= accuracy0
    assert accuracy2 >= accuracy0
    assert accuracy2 >= accuracy1-.05
コード例 #2
0
ファイル: boosting.py プロジェクト: NTCHANG1289/ML_coursework
    def inference(x, T, A):
        '''
            Given an adaboost ensemble of decision stumps and one data instance, infer the label of the instance.
            Every stump predicts a label; the weights of the stumps that agree on a label are summed.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, the value DT.most_common returns for the label -> total weight table.
        '''
        # collect every stump's prediction in one numpy array
        # (np.append coerces the labels to a common dtype, as the per-tree appends did)
        lab = np.append(np.array([]), [DT.inference(t, x) for t in T])

        # total weight of the stumps voting for each label
        d = dict()
        for i, label in enumerate(lab):
            if label in d:
                d[label] += A[i]
            else:
                d[label] = A[i]

        # NOTE(review): DT.most_common receives the label -> weight dict here;
        # confirm it picks the key with the largest accumulated weight.
        y = DT.most_common(d)
        return y
コード例 #3
0
 def build_tree(self, X,Y,D):
     '''
         Build a decision stump (replaces the recursive build_tree of the DT class).
         At most one level of children is created below the root.
         Input:
             X: the feature matrix, a numpy matrix of shape p by n.
                Each element can be int/float/string.
                Here n is the number data instances in the node, p is the number of attributes.
             Y: the class labels, a numpy array of length n. Each element can be int/float/string.
             D: the weights of instances, a numpy float vector of length n
         Return:
             t: the root node of the decision stump.
     '''
     t = Node(X,Y)
     # the root always carries its weighted majority label
     t.p = self.most_common(t.Y,D)
     # stop conditions 1 / 2: the root stays a leaf
     if DT.stop1(t.Y) or DT.stop2(t.X):
         t.isleaf = True
         return t

     # attribute index and threshold used for the single split
     t.i,t.th = self.best_attribute(t.X,t.Y,D)

     # instances below the threshold go to C1, all others to C2
     ind1 = [j for j,x in enumerate(X[t.i,:]) if x < t.th]
     ind2 = [j for j,x in enumerate(X[t.i,:]) if not x < t.th]

     def weighted_leaf(ind):
         # leaf child holding the selected instances, labelled by weighted majority
         Xc = X[:,ind]
         Yc = Y[ind]
         leaf = Node(Xc,Yc,isleaf = True)
         Dc = D[ind]
         total = float(sum(Dc))
         # rescale the child's weights in place by their sum
         for k,w in enumerate(D[ind]):
             Dc[k] = float(w)/total
         leaf.p = self.most_common(Yc,Dc)
         return leaf

     t.C1 = weighted_leaf(ind1)
     t.C2 = weighted_leaf(ind2)
     return t
コード例 #4
0
    def best_threshold(X,Y,D):
        '''
            Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
            Input:
                X: a list of values, a numpy array of int/float values.
                Y: a list of values, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, taken from DT.cutting_points(X,Y).
                g: the weighted information gain by using the best threshold;
                   -1 when the chosen threshold is -inf.
        '''
        candidates = list(DT.cutting_points(X,Y))

        gains = []
        for cut in candidates:
            # binarize the attribute: 0 below the cut, 1 otherwise
            binarized = X.copy()
            for i,value in enumerate(X):
                binarized[i] = 0 if value < cut else 1
            gains.append(DS.information_gain(Y,binarized,D))

        # first candidate reaching the maximum gain wins
        g = max(gains)
        th = candidates[gains.index(g)]

        # a threshold of -inf is reported with gain -1
        if th == float('-inf'):
            g = -1

        return th,g
コード例 #5
0
ファイル: problem3.py プロジェクト: swqs1989/CS539
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees
                   (an empty numpy array when T is empty).
        '''
        # predictions of all trees, as a numpy array
        y = np.array([DT.inference(t, x) for t in T])

        # number of votes per label
        count = {}
        for label in y:
            count[label] = count.get(label, 0) + 1

        # keep the first label that reaches the highest vote count;
        # items() (instead of the Python-2-only iteritems()) works on Python 2 and 3
        num = 0
        for label, votes in count.items():
            if votes > num:
                num = votes
                y = label
        return y
コード例 #6
0
ファイル: problem5.py プロジェクト: swqs1989/CS539
 def best_threshold(X, Y, D):
     '''
         Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
         Input:
             X: a list of values, a numpy array of int/float values.
             Y: a list of values, a numpy array of int/float/string values.
             D: the weights of instances, a numpy float vector of length n
         Output:
             th: the best threshold (-1 if no candidate has a gain above -1).
             g: the weighted information gain by using the best threshold.
             (-inf, -1) is returned when a TypeError is raised while scanning the candidates.
     '''
     candidates = DT.cutting_points(X, Y)
     th, g = -1, -1
     try:
         for cut in candidates:
             # "T" marks the instances above the cut, "F" the others
             side = np.array(["T" if value > cut else "F" for value in np.copy(X)])
             gain = DS.information_gain(Y, side, D)
             if gain > g:
                 th, g = cut, gain
     except TypeError:
         # raised e.g. when `candidates` is not iterable
         return -float('Inf'), -1
     return th, g
コード例 #7
0
    def best_threshold(X, Y, D):
        '''
            Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
            Input:
                X: a list of values, a numpy array of int/float values.
                Y: a list of values, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar (-inf when there is no usable cutting point).
                g: the weighted information gain by using the best threshold (-1 when there is none).
        '''
        g = -1
        th = float('-inf')

        cp = DT.cutting_points(X, Y)
        # anything other than an array of candidates means "no usable threshold"
        if not isinstance(cp, np.ndarray):
            return th, g

        for c in cp:
            # The boolean split is passed directly: going through
            # np.ma.masked_where(...).mask collapses to a scalar when no value exceeds c.
            gain = DS.information_gain(Y, X > c, D)
            if gain > g:
                g = gain
                th = c

        return th, g
コード例 #8
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        predictions = [DT.inference(tree, x) for tree in T]

        # votes per label
        votes = {}
        for label in predictions:
            votes[label] = votes.get(label, 0) + 1

        # ascending stable sort by vote count; the last entry is the winner
        ranked = sorted(votes, key=votes.get)
        y = ranked[-1]
        return y
コード例 #9
0
    def best_threshold(X, Y, D):
        '''
            Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
            Input:
                X: a list of values, a numpy array of int/float values.
                Y: a list of values, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain by using the best threshold, a float scalar.
                (-inf, -1) is returned when every candidate equals -inf.
        '''
        candidates = DT.cutting_points(X, Y)

        # all candidates being -inf means there is nothing to split on
        if np.all(candidates == -np.inf):
            return -float('inf'), -1

        # gain of the split "X >= cut" for each candidate
        gains = [DS.information_gain(Y, X >= cut, D)
                 for cut in np.nditer(candidates)]

        g = max(gains)
        th = candidates[np.argmax(gains)]
        return th, g
コード例 #10
0
    def load_dataset(filename='data3.csv'):
        '''
            Load a dataset through DT.load_dataset and cast the labels to integers.
            Input:
                filename: the filename of the dataset, a string.
            Output:
                X: the feature matrix exactly as returned by DT.load_dataset.
                Y: the class labels returned by DT.load_dataset, converted with astype(int).
        '''
        X, labels = DT.load_dataset(filename)
        return X, labels.astype(int)
コード例 #11
0
    def train(self, X, Y, n_tree=11):
        '''
            Train an ensemble of n_tree decision trees on a training set.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the training set, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                   Each element can be int/float/string.
                n_tree: the number of trees in the ensemble
            Output:
                T: a list of the root of each tree, a list of length n_tree.
        '''
        # NOTE(review): no bootstrap resampling is performed here, so every tree
        # is trained on the same full (X, Y) -- confirm this is intended.
        T = [DT().train(X, Y) for _ in range(n_tree)]
        return T
コード例 #12
0
    def inference(T,x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        predictions = [DT.inference(t, x) for t in T]

        # votes per label
        stat = {}
        for label in predictions:
            stat[label] = stat.get(label, 0) + 1

        # the first label reaching the highest count wins
        num = 0
        for label, votes in stat.items():
            if votes > num:
                num, y = votes, label
        return y
コード例 #13
0
    def inference(x, T, A):
        '''
            Given an adaboost ensemble of decision stumps and one data instance, infer the label of the instance.
            The label whose supporting stumps have the largest total weight is returned.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, a scalar of int/float/string.
        '''
        y_list = [DT.inference(stump, x) for stump in T]

        best = 0
        for label in set(y_list):
            # total weight of the stumps that predicted this label
            score = sum(A[j] for j in range(len(A)) if y_list[j] == label)
            # only a strictly larger (and positive) score replaces the current winner
            if score > best:
                best = score
                y = label
        return y
コード例 #14
0
ファイル: boosting.py プロジェクト: NTCHANG1289/ML_coursework
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump (replaces the recursive build_tree of the DT class).
            At most one level of children is created below the root.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n. Each element can be int/float/string.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y)
        t.p = DS.most_common(Y, D)

        # stop conditions 1 / 2: the root stays a leaf
        if DT.stop1(Y) or DT.stop2(X):
            t.isleaf = True
            return t

        # single split on the best attribute / threshold
        t.i, t.th = self.best_attribute(X, Y, D)
        t.C1, t.C2 = self.split(t.X, t.Y, t.i, t.th)

        # route each instance weight to the child its instance falls into
        goes_left = [X[t.i, j] < t.th for j in range(len(D))]
        D1 = np.array([D[j] for j, left in enumerate(goes_left) if left])
        D2 = np.array([D[j] for j, left in enumerate(goes_left) if not left])

        # both children are leaves labelled by their weighted majority class
        for child, weights in ((t.C1, D1), (t.C2, D2)):
            child.p = DS.most_common(child.Y, weights)
            child.isleaf = True

        return t
コード例 #15
0
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump (replaces the recursive build_tree of the DT class).
            At most one level of children is created below the root.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n. Each element can be int/float/string.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y)
        t.p = DS.most_common(t.Y, D)

        # stop conditions 1 / 2: the root stays a leaf
        if DT.stop1(t.Y) or DT.stop2(t.X):
            t.isleaf = True
            return t

        # single split on the best attribute / threshold
        t.i, t.th = DS().best_attribute(t.X, t.Y, D)
        t.C1, t.C2 = DT.split(t.X, t.Y, t.i, t.th)

        # weights of the instances falling into each child
        attribute = X[t.i]
        left_weights = D[np.where(attribute < t.th)]
        right_weights = D[np.where(attribute >= t.th)]

        # both children are leaves labelled by their weighted majority class
        t.C1.p = DS.most_common(t.C1.Y, left_weights)
        t.C2.p = DS.most_common(t.C2.Y, right_weights)
        t.C1.isleaf = True
        t.C2.isleaf = True
        return t
コード例 #16
0
    def train(self, X, Y, n_tree=11):
        '''
            Train a bagging ensemble of decision trees: each tree is fit on its own bootstrap sample.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the training set, p is the number of attributes.
                Y: the class labels, a numpy array of length n.
                   Each element can be int/float/string.
                n_tree: the number of trees in the ensemble
            Output:
                T: a list of the root of each tree, a list of length n_tree.
        '''
        T = []
        for _ in range(n_tree):
            # draw a fresh sample from the original (X, Y) for every tree
            X_sample, Y_sample = Bag.bootstrap(X, Y)
            T.append(DT().train(X_sample, Y_sample))
        return T
コード例 #17
0
ファイル: problem5.py プロジェクト: swqs1989/CS539
    def build_tree(self, X, Y, D):
        '''
            Build a decision stump (replaces the recursive build_tree of the DT class).
            At most one level of children is created below the root.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n. Each element can be int/float/string.
                D: the weights of instances, a numpy float vector of length n
            Return:
                t: the root node of the decision stump.
        '''
        t = Node(X, Y, isleaf=False)
        t.p = DS.most_common(Y, D)

        # stop conditions 1 / 2: the root stays a leaf
        if DT.stop1(Y) or DT.stop2(X):
            t.isleaf = True
            return t

        # single split on the best attribute / threshold
        t.i, t.th = self.best_attribute(X, Y, D)

        # boolean selectors for the two children
        values = X[t.i, :]
        below = values < t.th
        at_or_above = values >= t.th

        # both children are leaves labelled by their weighted majority class
        t.C1 = Node(X[:, below],
                    Y[below],
                    isleaf=True,
                    p=DS.most_common(Y[below], D[below]))
        t.C2 = Node(X[:, at_or_above],
                    Y[at_or_above],
                    isleaf=True,
                    p=DS.most_common(Y[at_or_above], D[at_or_above]))
        return t
コード例 #18
0
    def inference(x, T, A):
        '''
            Given an adaboost ensemble of decision stumps and one data instance, infer the label of the instance.
            Input:
                x: the attribute vector of a data instance, a numpy vector of shape p.
                   Each attribute value can be int/float
                T: the root nodes of decision stumps, a list of length n_tree.
                A: the weights of the decision stumps, a numpy float vector of length n_tree.
            Output:
                y: the class label, the result of DS.most_common over the stump predictions weighted by A.
        '''
        # one predicted label per stump, in the order of T
        votes = []
        for stump in T:
            votes.append(DT.inference(stump, x))

        return DS.most_common(votes, A)
コード例 #19
0
ファイル: Bootstrap.py プロジェクト: jinalj07/Decision-Trees
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        votes = [DT.inference(tree, x) for tree in T]

        # compare the distinct labels by how often they occur among the votes
        candidates = set(votes)
        return max(candidates, key=votes.count)
コード例 #20
0
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees
                   (the earliest such prediction on a tie).
        '''
        predictions = [DT.inference(tree, x) for tree in T]
        # the prediction that occurs most often wins
        return max(predictions, key=predictions.count)
コード例 #21
0
ファイル: problem3.py プロジェクト: yyaaa1/MLHW2
    def inference(T, x):
        '''
            Given a bagging ensemble of decision trees and one data instance, infer the label of the instance
            by majority vote over the trees.
            Input:
                T: a list of decision trees.
                x: the attribute vector, a numpy vector of shape p.
                   Each attribute value can be int/float
            Output:
                y: the label predicted by the largest number of trees.
        '''
        predictions = [DT.inference(tree, x) for tree in T]

        # distinct labels (sorted by np.unique) with their vote counts
        labels, counts = np.unique(predictions, return_counts=True)
        return labels[np.argmax(counts)]
コード例 #22
0
ファイル: boosting.py プロジェクト: NTCHANG1289/ML_coursework
    def best_threshold(X, Y, D):
        '''
            Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
            Input:
                X: a list of values, a numpy array of int/float values.
                Y: a list of values, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n
            Output:
                th: the best threshold (-inf when DT.cutting_points yields a bare float,
                    -1 if no candidate has a gain above -1).
                g: the weighted information gain by using the best threshold (-1 when there is none).
        '''
        cp = DT.cutting_points(X, Y)

        # a bare float instead of a sequence of candidates means "nothing to split on";
        # isinstance also recognises numpy float scalars
        if isinstance(cp, float):
            return -float('Inf'), -1

        th = g = -1
        for c in cp:
            # 'L' marks values larger than the cut, 'S' the others
            helper = np.asarray(['L' if x > c else 'S' for x in X])
            gain = DS.information_gain(Y, helper, D)
            if gain > g:
                th = c
                g = gain

        return th, g
コード例 #23
0
ファイル: AdaBoost.py プロジェクト: jinalj07/Decision-Trees
    def best_threshold(X,Y,D):
        '''
            Pick the cutting point of a continuous attribute that gives the highest weighted information gain.
            Input:
                X: a list of values, a numpy array of int/float values.
                Y: a list of values, a numpy array of int/float/string values.
                D: the weights of instances, a numpy float vector of length n

            Output:
                th: the best threshold, a float scalar.
                g: the weighted information gain by using the best threshold, a float scalar.
                (-inf, -1) is returned when every candidate equals -inf.
        '''
        ths = DT.cutting_points(X,Y)

        # all candidates being -inf means there is nothing to split on
        if np.all(ths == -np.inf):
            return -float('inf'),-1

        # gain of the split "X >= cut" for each candidate
        gs = [DS.information_gain(Y,X>=cut,D) for cut in ths]
        g = max(gs)
        th = ths[np.argmax(gs)]

        return th,g
コード例 #24
0
    def step(X, Y, D):
        '''
            Run one boosting round: fit a stump, weight it, and re-weight the instances.
            Input:
                X: the feature matrix, a numpy matrix of shape p by n.
                   Each element can be int/float/string.
                   Here n is the number data instances in the node, p is the number of attributes.
                Y: the class labels, a numpy array of length n. Each element can be int/float/string.
                D: the current weights of instances, a numpy float vector of length n
            Output:
                t: the root node of a decision stump trained in this step
                a: (alpha) the weight of the decision stump, from AB.compute_alpha.
                D: the new weights of instances, from AB.update_D.
        '''
        # fit the stump on the currently weighted data and predict the training set
        stump = DS().build_tree(X, Y, D)
        predicted = DT.predict(stump, X)

        # the stump's weight is derived from its weighted error rate
        alpha = AB.compute_alpha(AB.weighted_error_rate(Y, predicted, D))

        return stump, alpha, AB.update_D(D, alpha, Y, predicted)