def test_dataset4():
    ''' (2 points) test dataset4

        Trains a single decision tree, a bagging ensemble of 21 trees and a
        random forest of 21 trees on the even-indexed instances of the dataset
        returned by RF.load_dataset(), evaluates each on the odd-indexed
        instances, prints the three accuracies and checks that the ensembles
        are not worse than the single tree.
    '''
    n = 400
    X, Y = RF.load_dataset()
    assert X.shape == (16, 400)
    assert Y.shape == (400,)

    # even-indexed columns are the training half, odd-indexed the test half
    X_train, Y_train = X[:, ::2], Y[::2]
    X_test, Y_test = X[:, 1::2], Y[1::2]

    # baseline: one decision tree
    tree_model = DT()
    tree_root = tree_model.train(X_train, Y_train)
    Y_predict = DT.predict(tree_root, X_test)
    # the test half holds n/2 instances, hence the trailing factor of 2
    accuracy0 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a decision tree:', accuracy0)

    # bagging ensemble
    bag_model = Bag()
    bag_trees = bag_model.train(X_train, Y_train, 21)
    Y_predict = Bag.predict(bag_trees, X_test)
    accuracy1 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a bagging of 21 trees:', accuracy1)

    # random forest
    forest_model = RF()
    forest_trees = forest_model.train(X_train, Y_train, 21)
    Y_predict = RF.predict(forest_trees, X_test)
    accuracy2 = sum(Y_test == Y_predict) / float(n) * 2.
    print('test accuracy of a random forest of 21 trees:', accuracy2)

    assert accuracy1 >= accuracy0
    assert accuracy2 >= accuracy0
    assert accuracy2 >= accuracy1 - .05
def inference(x, T, A):
    '''
        Given an adaboost ensemble of decision trees and one data instance, infer the label of the instance.
        Every stump votes for the label it predicts, with its own weight; the
        label with the largest total weight is returned.
        Input:
            x: the attribute vector of a data instance, a numpy vector of shape p.
               Each attribute value can be int/float
            T: the root nodes of decision stumps, a list of length n_tree.
            A: the weights of the decision stumps, a numpy float vector of length n_tree.
        Output:
            y: the class label, a scalar of int/float/string.
        Raises:
            ValueError: if no stump/weight pair is available to vote.
    '''
    # Sum the stump weights per predicted label.  Keys stay exactly what
    # DT.inference returned, so labels are not coerced to a common numpy dtype.
    weight_of = {}
    for t, alpha in zip(T, A):
        label = DT.inference(t, x)
        weight_of[label] = weight_of.get(label, 0) + alpha

    if not weight_of:
        raise ValueError('cannot infer a label from an empty ensemble')

    # max() returns the first maximal key in the dict's iteration order.
    y = max(weight_of, key=weight_of.get)
    return y
def build_tree(self, X, Y, D):
    '''
        Build a decision stump by overriding the build_tree function of the DT class.
        Instead of building tree nodes recursively as in DT, at most one level of children nodes is built.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the node, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            D: the weights of instances, a numpy float vector of length n
        Return:
            t: the root node of the decision stump.
    '''
    t = Node(X, Y)
    t.p = self.most_common(t.Y, D)

    # if Condition 1 or 2 holds, stop splitting
    if DT.stop1(t.Y) or DT.stop2(t.X):
        t.isleaf = True
        return t

    # find the best attribute to split
    t.i, t.th = self.best_attribute(t.X, t.Y, D)

    # configure each child node: values below the threshold go to C1,
    # everything else to C2
    below = X[t.i, :] < t.th
    children = []
    for side in (below, ~below):
        Y_side = Y[side]
        child = Node(X[:, side], Y_side, isleaf=True)
        w = D[side]  # boolean indexing returns a copy, D itself is untouched
        total = float(sum(w))
        # Renormalise the child's weights to sum to 1.  A zero total (empty
        # child or all-zero weights) is left as is instead of dividing by 0.
        if total != 0:
            w = w / total
        child.p = self.most_common(Y_side, w)
        children.append(child)
    t.C1, t.C2 = children
    return t
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               When there is no usable cutting point (no candidates, or the
               best candidate is -inf) the pair (-inf, -1) is returned.
    '''
    X = np.asarray(X)
    # atleast_1d lets a bare scalar result be handled like a one-element array
    candidates = np.atleast_1d(DT.cutting_points(X, Y))
    if candidates.size == 0:
        return float('-inf'), -1

    gains = []
    for p in candidates:
        # binarise the attribute: 0 for values below the cutting point, 1 otherwise
        side = np.where(X < p, 0, 1).astype(X.dtype)
        gains.append(DS.information_gain(Y, side, D))

    g = max(gains)
    th = candidates[gains.index(g)]  # first candidate reaching the best gain
    if th == float('-inf'):
        g = -1
    return th, g
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        The label predicted by the largest number of trees is returned.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the majority label among the trees' predictions, a scalar.
        Raises:
            ValueError: if T is empty.
    '''
    votes = np.array([DT.inference(t, x) for t in T])
    if votes.size == 0:
        raise ValueError('cannot infer a label from an empty ensemble')

    count = {}
    for label in votes:
        count[label] = count.get(label, 0) + 1

    # max() keeps the first key with the highest count in dict iteration order,
    # which is what a strict "v > num" scan over the items selects as well.
    y = max(count, key=count.get)
    return y
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               (-inf, -1) is returned when the cutting points are not iterable;
               (-1, -1) when they are iterable but no candidate has gain above -1.
    '''
    cp = DT.cutting_points(X, Y)

    # Only the iterability check is guarded: a TypeError raised later (for
    # instance inside DS.information_gain) is a real error and must surface.
    try:
        candidates = iter(cp)
    except TypeError:
        # presumably a scalar "no cutting point" result - verify against DT.cutting_points
        return -float('Inf'), -1

    values = np.asarray(X)
    th = -1
    g = -1
    for v in candidates:
        # "T" marks values strictly above the candidate, "F" the rest
        side = np.where(values > v, "T", "F")
        ig = DS.information_gain(Y, side, D)
        if ig > g:
            th, g = v, ig
    return th, g
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               (-inf, -1) is returned when the cutting points are not a numpy
               array or when no candidate has a gain above -1.
    '''
    th = float('-inf')
    g = -1

    cp = DT.cutting_points(X, Y)
    if not isinstance(cp, np.ndarray):
        # presumably the scalar "no cutting point" result - verify against DT.cutting_points
        return th, g

    for c in cp:
        # Boolean split of the attribute: True for values strictly above c.
        # Always an array, even when no value lies above the candidate.
        above = X > c
        gain = DS.information_gain(Y, above, D)  # evaluated once per candidate
        if gain > g:
            g, th = gain, c
    return th, g
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the label predicted by the largest number of trees.
    '''
    votes = [DT.inference(tree, x) for tree in T]

    # number of votes per label, keyed in first-seen order
    tally = {}
    for label in votes:
        tally[label] = tally.get(label, 0) + 1

    # ascending stable sort by vote count; the winner is the last entry
    ranked = sorted(tally, key=tally.get)
    return ranked[-1]
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               (-inf, -1) is returned when every cutting point equals -inf.
    '''
    candidates = DT.cutting_points(X, Y)

    # nothing to split on
    if np.all(candidates == -np.inf):
        return -float('inf'), -1

    # weighted information gain of the boolean split "X >= candidate"
    gains = [DS.information_gain(Y, X >= c, D) for c in np.nditer(candidates)]

    g = max(gains)
    th = candidates[np.argmax(gains)]
    return th, g
def load_dataset(filename='data3.csv'):
    '''
        Load dataset 3 from a CSV file (data3.csv by default).
        The first row of the file is the header (including the names of the attributes).
        In the remaining rows, each row represents one data instance.
        The first column of the file is the label to be predicted.
        In the remaining columns, each column represents an attribute.
        Input:
            filename: the filename of the dataset, a string.
        Output:
            X: the feature matrix, a numpy matrix of shape p by n, exactly as
               returned by DT.load_dataset.
               Here n is the number data instances in the dataset, p is the number of attributes.
            Y: the class labels, a numpy array of length n, cast to int.
    '''
    # parsing is delegated to the decision-tree loader; only the labels are converted
    features, labels = DT.load_dataset(filename)
    return features, labels.astype(int)
def train(self, X, Y, n_tree=11):
    '''
        Given a training set, train a bagging ensemble of decision trees.
        Each tree is trained on its own bootstrap sample of the training set.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the training set, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            n_tree: the number of trees in the ensemble
        Output:
            T: a list of the root of each tree, a list of length n_tree.
    '''
    T = []
    for _ in range(n_tree):
        # Draw a fresh sample from the ORIGINAL data for every tree; the sample
        # gets its own names so that X and Y are never overwritten.
        # assumes Bag.bootstrap(X, Y) returns a resampled (X, Y) pair - confirm
        X_boot, Y_boot = Bag.bootstrap(X, Y)
        T.append(DT().train(X_boot, Y_boot))
    return T
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the label predicted by the largest number of trees.
    '''
    predictions = [DT.inference(t, x) for t in T]

    # votes per label
    votes = dict()
    for label in predictions:
        votes[label] = votes.get(label, 0) + 1

    # keep the first label whose count strictly exceeds the best seen so far
    top = 0
    for label, n_votes in votes.items():
        if n_votes > top:
            top, y = n_votes, label
    return y
def inference(x, T, A):
    '''
        Given an adaboost ensemble of decision trees and one data instance, infer the label of the instance.
        Every stump votes for the label it predicts, with its own weight; the
        label with the largest total weight is returned, also when all totals
        are zero or negative.
        Input:
            x: the attribute vector of a data instance, a numpy vector of shape p.
               Each attribute value can be int/float
            T: the root nodes of decision stumps, a list of length n_tree.
            A: the weights of the decision stumps, a numpy float vector of length n_tree.
        Output:
            y: the class label, a scalar of int/float/string.
        Raises:
            ValueError: if no stump/weight pair is available to vote.
    '''
    # one pass: total weight per predicted label, in first-seen order
    totals = {}
    for stump, alpha in zip(T, A):
        label = DT.inference(stump, x)
        totals[label] = totals.get(label, 0) + alpha

    if not totals:
        raise ValueError('cannot infer a label from an empty ensemble')

    # No "> 0" starting bar: a label is always selected.  max() returns the
    # first maximal key in the dict's iteration order, so ties are deterministic.
    y = max(totals, key=totals.get)
    return y
def build_tree(self, X, Y, D):
    '''
        Build a decision stump by overriding the build_tree function of the DT class.
        Instead of building tree nodes recursively as in DT, at most one level of children nodes is built.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the node, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            D: the weights of instances, a numpy float vector of length n
        Return:
            t: the root node of the decision stump.
    '''
    t = Node(X, Y)
    t.p = DS.most_common(Y, D)

    # if Condition 1 or 2 holds, stop splitting
    if DT.stop1(Y) or DT.stop2(X):
        t.isleaf = True
        return t

    # find the best attribute to split
    t.i, t.th = self.best_attribute(X, Y, D)

    # configure each child node
    t.C1, t.C2 = self.split(t.X, t.Y, t.i, t.th)

    # route every instance weight to the child its attribute value falls into:
    # bucket 0 for values below the threshold, bucket 1 otherwise
    buckets = ([], [])
    for j, w in enumerate(D):
        buckets[0 if X[t.i, j] < t.th else 1].append(w)
    D1, D2 = (np.array(b) for b in buckets)

    t.C1.p = DS.most_common(t.C1.Y, D1)
    t.C2.p = DS.most_common(t.C2.Y, D2)
    t.C1.isleaf = True
    t.C2.isleaf = True
    return t
def build_tree(self, X, Y, D):
    '''
        Build a decision stump by overriding the build_tree function of the DT class.
        Instead of building tree nodes recursively as in DT, at most one level of children nodes is built.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the node, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            D: the weights of instances, a numpy float vector of length n
        Return:
            t: the root node of the decision stump.
    '''
    t = Node(X, Y)
    t.p = DS.most_common(t.Y, D)

    # if Condition 1 or 2 holds, stop splitting
    if DT.stop1(t.Y) or DT.stop2(t.X):
        t.isleaf = True
        return t

    # find the best attribute to split
    t.i, t.th = DS().best_attribute(t.X, t.Y, D)

    # configure each child node
    t.C1, t.C2 = DT.split(t.X, t.Y, t.i, t.th)
    attribute = X[t.i]
    sides = ((t.C1, attribute < t.th), (t.C2, attribute >= t.th))
    for child, chosen in sides:
        # weighted majority label over the instances routed to this child
        child.p = DS.most_common(child.Y, D[chosen])
        child.isleaf = True
    return t
def train(self, X, Y, n_tree=11):
    '''
        Given a training set, train a bagging ensemble of decision trees.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the training set, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            n_tree: the number of trees in the ensemble
        Output:
            T: a list of the root of each tree, a list of length n_tree.
    '''
    forest = []
    for _ in range(n_tree):
        # each tree gets its own sample drawn by Bag.bootstrap
        X_sample, Y_sample = Bag.bootstrap(X, Y)
        forest.append(DT().train(X_sample, Y_sample))
    return forest
def build_tree(self, X, Y, D):
    '''
        Build a decision stump by overriding the build_tree function of the DT class.
        Instead of building tree nodes recursively as in DT, at most one level of children nodes is built.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the node, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            D: the weights of instances, a numpy float vector of length n
        Return:
            t: the root node of the decision stump.
    '''
    t = Node(X, Y, isleaf=False)
    t.p = DS.most_common(Y, D)

    # if Condition 1 or 2 holds, stop splitting
    if DT.stop1(Y) or DT.stop2(X):
        t.isleaf = True
        return t

    # find the best attribute to split
    t.i, t.th = self.best_attribute(X, Y, D)

    # configure each child node: both are leaves labelled with the weighted
    # majority class of the instances they receive
    values = X[t.i, :]
    lower = values < t.th
    upper = values >= t.th

    lower_label = DS.most_common(Y[lower], D[lower])
    t.C1 = Node(X[:, lower], Y[lower], isleaf=True, p=lower_label)

    upper_label = DS.most_common(Y[upper], D[upper])
    t.C2 = Node(X[:, upper], Y[upper], isleaf=True, p=upper_label)
    return t
def inference(x, T, A):
    '''
        Given an adaboost ensemble of decision trees and one data instance, infer the label of the instance.
        Input:
            x: the attribute vector of a data instance, a numpy vector of shape p.
               Each attribute value can be int/float
            T: the root nodes of decision stumps, a list of length n_tree.
            A: the weights of the decision stumps, a numpy float vector of length n_tree.
        Output:
            y: the class label, a scalar of int/float/string.
    '''
    # prediction of every stump, in ensemble order
    votes = []
    for stump in T:
        votes.append(DT.inference(stump, x))

    # the stump weights are passed as the vote weights
    return DS.most_common(votes, A)
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        The label predicted by the largest number of trees is returned; on a
        tie the label that was predicted first (in ensemble order) wins.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the majority label among the trees' predictions, a scalar.
        Raises:
            ValueError: if T is empty.
    '''
    Y = [DT.inference(t, x) for t in T]
    if not Y:
        raise ValueError('cannot infer a label from an empty ensemble')

    # count every label once instead of rescanning the list per candidate
    counts = {}
    for label in Y:
        counts[label] = counts.get(label, 0) + 1

    # Scanning the predictions in list order makes tie-breaking deterministic;
    # it does not depend on set/hash ordering.
    y = max(Y, key=counts.__getitem__)
    return y
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the label predicted by the largest number of trees.
    '''
    votes = [DT.inference(tree, x) for tree in T]
    # the prediction that occurs most often in the list wins
    return max(votes, key=votes.count)
def inference(T, x):
    '''
        Given a bagging ensemble of decision trees and one data instance, infer the label of the instance.
        Input:
            T: a list of decision trees.
            x: the attribute vector, a numpy vector of shape p.
               Each attribute value can be int/float
        Output:
            y: the label predicted by the largest number of trees.
    '''
    votes = [DT.inference(tree, x) for tree in T]
    # distinct labels together with how often each one was predicted
    labels, counts = np.unique(votes, return_counts=True)
    return labels[np.argmax(counts)]
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               (-inf, -1) is returned when the cutting points are a single
               float; (-1, -1) when no candidate has a gain above -1.
    '''
    cp = DT.cutting_points(X, Y)
    # isinstance also matches numpy.float64, which subclasses float
    if isinstance(cp, float):
        return -float('Inf'), -1

    values = np.asarray(X)
    th = g = -1
    for c in cp:
        # 'L' marks values strictly larger than the candidate, 'S' the rest
        helper = np.where(values > c, 'L', 'S')
        gg = DS.information_gain(Y, helper, D)
        if gg > g:
            th, g = c, gg
    return th, g
def best_threshold(X, Y, D):
    '''
        Find the best threshold among all possible cutting points in the continuous attribute of X.
        The data instances are weighted.
        Input:
            X: a list of values, a numpy array of int/float values.
            Y: a list of values, a numpy array of int/float/string values.
            D: the weights of instances, a numpy float vector of length n
        Output:
            th: the best threshold, a float scalar.
            g: the weighted information gain by using the best threshold, a float scalar.
               (-inf, -1) is returned when every cutting point equals -inf.
    '''
    ths = DT.cutting_points(X, Y)
    if np.all(ths == -np.inf):
        return -float('inf'), -1

    # weighted information gain of the boolean split "X >= cut" per candidate
    gs = []
    for cut in ths:
        gs.append(DS.information_gain(Y, X >= cut, D))

    g = max(gs)
    th = ths[np.argmax(gs)]
    return th, g
def step(X, Y, D):
    '''
        Compute one step of Boosting.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string.
               Here n is the number data instances in the node, p is the number of attributes.
            Y: the class labels, a numpy array of length n.
               Each element can be int/float/string.
            D: the current weights of instances, a numpy float vector of length n
        Output:
            t: the root node of a decision stump trained in this step
            a: (alpha) the weight of the decision stump, a float scalar.
            D: the new weights of instances, a numpy float vector of length n
    '''
    # train a stump on the weighted data, then predict the training set with it
    stump = DS().build_tree(X, Y, D)
    predicted = DT.predict(stump, X)

    # the weighted error gives the stump weight, which drives the reweighting
    error = AB.weighted_error_rate(Y, predicted, D)
    alpha = AB.compute_alpha(error)
    new_D = AB.update_D(D, alpha, Y, predicted)
    return stump, alpha, new_D