def best_attribute(self, X, Y):
    '''
        Pick the attribute and threshold used to split a node, looking only
        at a random subset of the attributes.
        (Overrides best_attribute of the parent class.)
        m = floor(sqrt(p)) distinct attributes are drawn at random; the
        search over those m attributes is delegated to Bag.best_attribute.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Here n is the number of data instances in the node and
               p is the number of attributes. The selected rows are copied
               into a float array, so the values must be numeric.
            Y: the class labels, a numpy array of length n.
        Output:
            i: the index (row of X) of the attribute to split on
            th: the threshold of the attribute to split, as returned by
                Bag.best_attribute
    '''
    num_attrs, num_samples = X.shape
    num_picked = math.floor(math.sqrt(num_attrs))

    # distinct attribute indices (drawn without replacement)
    chosen = np.random.choice(num_attrs, num_picked, replace=False)

    # float copy of the chosen rows, in the order they were drawn
    sampled = np.zeros((num_picked, num_samples))
    sampled[...] = X[chosen, :]

    # row position inside `sampled` -> attribute index inside X
    row_to_attr = dict(enumerate(chosen))

    row, th = Bag().best_attribute(sampled, Y)
    i = row_to_attr[row]
    return i, th
def best_attribute(self, X, Y):
    '''
        Find the best attribute to split the node.
        (Overrides the best_attribute function of the parent class.)
        The attributes have continuous values (int/float).
        Only a random sample of m distinct attributes is considered,
        with m = floor(sqrt(p)).
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Here n is the number of data instances in the node,
               p is the number of attributes.
            Y: the class labels, a numpy array of length n.
        Output:
            i: the index of the attribute to split, an integer scalar
               (an index into the rows of the full matrix X)
            th: the threshold of the attribute to split, a float scalar
    '''
    p = X.shape[0]
    m = int(np.floor(np.sqrt(p)))

    # Draw WITHOUT replacement: the default (replace=True) may pick the same
    # attribute several times, so fewer than m distinct attributes would be
    # evaluated.
    subset_index = np.random.choice(p, m, replace=False)

    # Search only the sampled rows; `i` is a position inside subset_index.
    i, th = Bag.best_attribute(self, X[subset_index], Y)

    # A threshold of -inf is treated as "no usable split in this sample"
    # (presumably what Bag.best_attribute reports in that case -- verify),
    # so a fresh random subset is drawn and the search is repeated.
    if th == -np.inf:
        return RF.best_attribute(self, X, Y)

    # Translate the position within the sample back to a row index of X.
    i = subset_index[i]
    return i, th
def best_attribute(self, X, Y):
    '''
        Find the best attribute to split the node.
        (Overrides the best_attribute function of the parent class.)
        The attributes have continuous values (int/float).
        Only a random sample of m distinct attributes is considered,
        with m = floor(sqrt(p)).
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Here n is the number of data instances in the node,
               p is the number of attributes.
            Y: the class labels, a numpy array of length n.
        Output:
            i: the index of the attribute to split, an integer scalar
               (an index into the rows of the full matrix X)
            th: the threshold of the attribute to split, a float scalar
    '''
    p = X.shape[0]
    m = int(np.floor(np.sqrt(p)))

    # Draw WITHOUT replacement so the m candidate attributes are distinct.
    indices = np.random.choice(p, m, replace=False)

    # Search only the sampled rows; `i` is a position inside `indices`,
    # not yet an attribute index of X.
    i, th = Bag.best_attribute(self, X[indices], Y)

    # A threshold of -inf is treated as "no usable split in this sample"
    # (presumably what Bag.best_attribute reports in that case -- verify),
    # so a fresh random subset is drawn and the search is repeated.
    if th == -np.inf:
        return RF.best_attribute(self, X, Y)

    # Map the position within the sample back to the row index of X;
    # returning the raw position would make the caller split on the
    # wrong attribute.
    return indices[i], th
def test_dataset3():
    ''' (2 points) test dataset3

        Trains AdaBoost ensembles of 5 and 20 trees on the even-indexed
        instances of the dataset returned by Bag.load_dataset() and checks
        the accuracy obtained on the odd-indexed instances.
    '''
    booster = AB()
    X, Y = Bag.load_dataset()
    n = float(len(Y))

    # even-indexed instances are used for training, odd-indexed for testing
    X_train, Y_train = X[:, ::2], Y[::2]
    X_test, Y_test = X[:, 1::2], Y[1::2]

    def held_out_accuracy(n_trees):
        # fit an ensemble of n_trees trees, then score it on the test half;
        # hits are divided by n and doubled, i.e. measured against n / 2
        T, A = booster.train(X_train, Y_train, n_trees)
        Y_predict = AB.predict(X_test, T, A)
        return sum(Y_test == Y_predict) / n * 2.

    accuracy = held_out_accuracy(5)
    print('test accuracy of an AdaBoost ensemble of 5 trees:', accuracy)
    assert accuracy >= .85

    accuracy = held_out_accuracy(20)
    print('test accuracy of an AdaBoost ensemble of 20 trees:', accuracy)
    assert accuracy >= .98
def load_dataset(filename='data4.csv'):
    '''
        Load a dataset from a CSV file (by default 'data4.csv').
        The actual reading is delegated to Bag.load_dataset, so the file is
        expected to follow that function's layout: a header row with the
        attribute names, then one row per data instance, with the label in
        the first column and the attributes in the remaining columns.
        Input:
            filename: the filename of the dataset, a string.
        Output:
            X: the feature matrix, a numpy matrix of shape p by n.
               Here n is the number of data instances in the dataset,
               p is the number of attributes.
            Y: the class labels, a numpy array of length n.
    '''
    features, labels = Bag.load_dataset(filename)
    return features, labels