def test_compute_info_gain(self):
    # compute_info_gain(self, column, column_i, threshold, train_Y, parent_entropy)
    np.random.seed(10)
    column = np.array([10, 5, 2])
    Y = np.array([0, 1, 1])
    node = DecisionTreeNode(1, 3, set(Y))
    print(node.compute_info_gain(column, threshold=3, train_Y=Y,
                                 parent_entropy=1))

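# What the test above presumably exercises: an entropy-based information gain
# over a <= / > threshold split of a single feature column. The sketch below
# is illustrative only; the real logic lives in
# DecisionTreeNode.compute_info_gain, whose internals are not shown here.
def info_gain_sketch(column, threshold, train_Y, parent_entropy):
    import numpy as np

    def entropy(y):
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return float(-(p * np.log2(p)).sum())

    left, right = train_Y[column <= threshold], train_Y[column > threshold]
    n = len(train_Y)
    children = (len(left) / n) * entropy(left) + (len(right) / n) * entropy(right)
    return parent_entropy - children

# With column=[10, 5, 2], threshold=3, Y=[0, 1, 1], only the value 2 falls on
# the left side, so the split isolates one of the two 1-labels and the gain
# works out to 1 - 2/3 = 1/3.
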
def computeTree(r, currentAttributeToTree, listOfAllAttributeToTree, root, count):
    # Replace the incoming root with the attribute tree whose children yield
    # the highest information gain on the current restaurant set.
    key_max = max(currentAttributeToTree.keys(),
                  key=lambda k: currentAttributeToTree[k].calculateChildrenGain(calculateT(r)))
    root = currentAttributeToTree[key_max]
    for child in root.children:
        if child.yesToWillWait == 0 or child.noToWillWait == 0:
            # Pure branch: all remaining examples agree, so attach a leaf.
            r = filterRestuarant(root.data, r, child.data)
            if child.yesToWillWait > child.noToWillWait:
                child.children = [DecisionTreeNode("YES")]
            else:
                child.children = [DecisionTreeNode("NO")]
    # SPECIAL CASE: an attribute value seen in an earlier iteration is missing
    # from the current one; recover it from the stored trees.
    for child in listOfAllAttributeToTree[0][str(root.data) + "Tree"].children:
        if not any(child.data == u.data for u in root.children):
            # Walk back through previous iterations until the value is found.
            for a in listOfAllAttributeToTree[::-1]:
                if child in a[str(root.data) + "Tree"].children:
                    node = DecisionTreeNode(child.data)
                    if child.yesToWillWait >= child.noToWillWait:
                        node.children = [DecisionTreeNode("YES")]
                    else:
                        node.children = [DecisionTreeNode("NO")]
                    root.children.append(node)
                    break
    for child in root.children:
        if child.yesToWillWait != 0 and child.noToWillWait != 0:
            # Mixed branch: rebuild the candidate trees on the filtered data
            # and recurse.
            currentAttributeToTree = makeTree(r)
            count += 1
            listOfAllAttributeToTree.append(currentAttributeToTree)
            child.children = [computeTree(r, currentAttributeToTree,
                                          listOfAllAttributeToTree,
                                          child.children, count)]
    return root

def test_train(self):
    print('start test test_train')
    # split_data_and_attrs(self, train_X, train_Y, attrs_ids, parent_entropy)
    train_X = np.array([[1, 2, 3, 1], [1, 3, 5, 8], [1, 2, 7, 5]])
    train_Y = np.array([0, 0, 1])
    attrs_ids = range(4)
    node = DecisionTreeNode(1, 3, set(train_Y))
    left_train_X, left_train_Y, left_attrs_ids, \
        right_train_X, right_train_Y, right_attrs_ids = \
        node.split_data_and_attrs(train_X=train_X, train_Y=train_Y,
                                  attrs_ids=attrs_ids, parent_entropy=1)
    print('case 1:')
    node.train(train_X=train_X, train_Y=train_Y, attrs_ids=attrs_ids)
    print('--------------')

def build(self):
    # Create N empty trees, then grow each one on its own random data block.
    for _ in range(self.N):
        self.forest.append(DecisionTree(DecisionTreeNode()))
    data_blocks = self._random_data_blocks()
    for tree, block in zip(self.forest, data_blocks):
        tree.data = block
        tree.process_data(self.data)
        tree.build(tree.root, tree.data, 0)

def buildTreeFromRoot(restaurant, attribute):
    root = DecisionTreeRoot(attribute)
    for r in restaurant:
        if not any(child.data == r.mapToAttributeValue[attribute]
                   for child in root.children):
            root.children.append(DecisionTreeNode(r.mapToAttributeValue[attribute]))
        for child in root.children:
            if child.data == r.mapToAttributeValue[attribute]:
                if r.willWait:
                    child.addOneToYesWillWait()
                else:
                    child.addOneToNoWillWait()
    return root

def test_majority(self):
    print('--------')
    print('case 1:')
    train_Y = np.array([5, 0, 0, 5, 0, 1])  # majority 0
    node = DecisionTreeNode(1, 3, set(train_Y))
    print(node.majority(train_Y))
    print('--------')

    print('--------')
    print('case 2:')
    train_Y = np.array([5, 0, 5, 5, 0, 1])  # majority 5
    node = DecisionTreeNode(1, 3, set(train_Y))
    print(node.majority(train_Y))
    print('--------')

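# A sketch of the majority vote the cases above expect: the most frequent
# label wins (ties broken by np.argmax taking the first maximum). Assumed
# behavior only; DecisionTreeNode.majority may resolve ties differently.
def majority_sketch(train_Y):
    import numpy as np
    labels, counts = np.unique(train_Y, return_counts=True)
    return labels[np.argmax(counts)]

# majority_sketch(np.array([5, 0, 0, 5, 0, 1]))  # -> 0, as in case 1
# majority_sketch(np.array([5, 0, 5, 5, 0, 1]))  # -> 5, as in case 2
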
def Generate_decision_tree(Dx, attribute_listx):
    # Base case 1: every record in the partition has the same class label.
    issameclass = isSameClass(Dx)
    if issameclass is not None:
        obj = DecisionTreeNode(issameclass)
        obj.status = "issameclass"
        return obj
    # Base case 2: no attributes left to split on; fall back to majority vote.
    if len(attribute_listx) == 0:
        ret = getMajorityVoting(Dx)
        obj = DecisionTreeNode(ret)
        obj.status = "attribute length zero"
        return obj
    # Work on copies so the caller's data and attribute list stay intact.
    D = copy.deepcopy(Dx)
    attribute_list = copy.deepcopy(attribute_listx)
    splitting_attribute, infoGain, split_point = attribute_selection_method(D, attribute_list)
    node = DecisionTreeNode(splitting_attribute)
    node.splitpoint = split_point
    attribute_list.remove(splitting_attribute)
    if AttributeType[splitting_attribute] == "Categorical":
        DatabaseList, split_att_values = getPartitionsForCategorical(D, splitting_attribute)
    else:
        DatabaseList, split_att_values = getPartitionsForContinuous(D, splitting_attribute, split_point)
    for idx, partition in enumerate(DatabaseList):
        if len(partition) <= pruneThreshold:
            # Partition too small to split further: prune to a majority-vote leaf.
            ret = getMajorityVoting(Dx)
            childNode = DecisionTreeNode(ret)
            childNode.status = "partition length zero"
        else:
            childNode = Generate_decision_tree(partition, attribute_list)
        if AttributeType[splitting_attribute] == "Categorical":
            edgeLabel[(node, childNode)] = split_att_values[idx]
        node.children.append(childNode)
    return node

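# Generate_decision_tree delegates the actual splitting to
# getPartitionsForCategorical / getPartitionsForContinuous. A sketch of the
# categorical case, ASSUMING each record is a dict mapping attribute names to
# values (the real record type is defined elsewhere in this project):
def partitions_for_categorical_sketch(D, attribute):
    partitions = {}
    for record in D:
        # Group records by their value of the splitting attribute.
        partitions.setdefault(record[attribute], []).append(record)
    values = list(partitions.keys())
    # One partition per observed value, plus the value list used to label the
    # edges, mirroring DatabaseList and split_att_values above.
    return [partitions[v] for v in values], values
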
def test_split_data_and_attrs_optimized(self):
    def show(case, left_X, left_Y, left_ids, right_X, right_Y, right_ids,
             with_ids=False):
        # Print both halves of a split, optionally with the attribute ids.
        print(case)
        print('left_train_X:')
        print(left_X)
        print(left_Y)
        print('\nright_train_X')
        print(right_X)
        print(right_Y)
        if with_ids:
            print('\nleft_attrs_ids:')
            print(left_ids)
            print('\nright_attrs_ids')
            print(right_ids)
        print('--------------')

    # case 1: a clean 2-vs-1 label split
    train_X = np.array([[1, 2, 3], [1, 3, 5], [1, 2, 7]])
    train_Y = np.array([0, 0, 1])
    node = DecisionTreeNode(1, 3, set(train_Y))
    split = node.split_data_and_attrs_optimized(train_X=train_X, train_Y=train_Y,
                                                attrs_ids=[0, 1, 2], parent_entropy=1)
    show('case 1:', *split)

    # case 2: binary features, two labels; attribute ids are reported too
    train_X = np.array([[1, 1, 1], [1, 1, 0], [0, 0, 1], [1, 0, 0]])
    train_Y = np.array([1, 1, 2, 2])
    node = DecisionTreeNode(1, 3, set(train_Y))
    split = node.split_data_and_attrs_optimized(train_X=train_X, train_Y=train_Y,
                                                attrs_ids=[0, 1, 2], parent_entropy=1)
    show('case 2:', *split, with_ids=True)

    # case 3: constant features and labels; nothing useful to split on
    train_X = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]])
    train_Y = np.array([1, 1, 1, 1])
    node = DecisionTreeNode(1, 3, set(train_Y))
    split = node.split_data_and_attrs_optimized(train_X=train_X, train_Y=train_Y,
                                                attrs_ids=[0, 2], parent_entropy=1)
    show('case 3:', *split, with_ids=True)

    # case 4: same X as case 1 with the labels permuted
    train_X = np.array([[1, 2, 3], [1, 3, 5], [1, 2, 7]])
    train_Y = np.array([1, 0, 1])
    node = DecisionTreeNode(1, 3, set(train_Y))
    split = node.split_data_and_attrs_optimized(train_X=train_X, train_Y=train_Y,
                                                attrs_ids=[0, 1, 2], parent_entropy=1)
    show('case 4:', *split)

def test_compute_entropy(self):
    np.random.seed(10)
    Y = np.random.randint(0, 2, 100)
    node = DecisionTreeNode(1, 3, set(Y))
    print(node.compute_entropy(Y))

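# The quantity the test above prints is presumably Shannon entropy; a
# self-contained sketch for reference (not the class's actual implementation):
def entropy_sketch(Y):
    import numpy as np
    _, counts = np.unique(Y, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

# For 100 labels drawn roughly uniformly from {0, 1}, the result is close to
# the maximum of 1 bit.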