def main():
    """Grow a decision tree, score it, post-prune it, and score it again.

    Expected command line:
        main.py L K training_set.csv validation_set.csv test_set.csv

    L and K are the integer pruning parameters handed to
    ``DecisionTree.pruneTree``. The three file names are looked up under
    ``./data1/``. Accuracy is measured on the test set both before and
    after pruning; the pruned tree is printed to standard output.
    """
    # Program name plus five arguments are required; otherwise show usage.
    if len(sys.argv) <= 5:
        print(
            "-- python --|-- main.py --|-- L --|-- K --|-- training_set.csv --|-- validation_set.csv --|-- test_set.csv --|"
        )
        sys.exit(1)

    raw_attempts, raw_seed = sys.argv[1], sys.argv[2]
    attempts = int(raw_attempts)
    max_prunes = int(raw_seed)

    # All data files live in a fixed directory relative to the working dir.
    data_dir = './data1/'
    train_path, validation_path, test_path = (
        data_dir + file_name for file_name in sys.argv[3:6]
    )

    # Fit the unpruned tree on the training data.
    tree = DecisionTree(train_path)

    # One validator, bound to the test data, scores both versions of the tree.
    test_validator = Validator(test_path)

    # Baseline: accuracy of the fully grown tree.
    test_validator.calculateAccuracy(tree.root)
    print("\nA decision tree is fully grown to fit the training data.")
    test_validator.displayAccuracy()

    # Announce and run post pruning against the validation data.
    print("\nPost prunning", '.' * 30)
    print("L =", attempts, ", K =", max_prunes, ", the pruned decision tree is:\n")
    tree.pruneTree(attempts, max_prunes, validation_path)

    # Printing the tree object relies on DecisionTree's string conversion.
    print(tree)

    # Accuracy of the pruned tree on the same test data.
    test_validator.calculateAccuracy(tree.root)
    test_validator.displayAccuracy()
def pruneTree(self, L, K, validation_set):
    """Post prune the decision tree using two parameters L, K and the validation data.

    Makes up to L attempts. Each attempt deep-copies the best tree found so
    far, turns between 1 and K randomly chosen non-leaf nodes of the copy
    into leaves, and keeps the copy if its accuracy on the validation data
    is at least that of the current best tree.

    Pruning stops early as soon as a candidate has no prunable node left;
    that candidate is discarded. In every case ``self.root`` is updated to
    the best tree found before returning.

    Args:
        L : The number of post pruning attempts.
        K : Upper bound on the number of nodes pruned per attempt; the
            actual count is drawn from 1..K. When L >= 1, K must be at
            least 1, otherwise ``random.randint`` raises ``ValueError``.
        validation_set : The validation data for post pruning (passed to
            ``Validator``).

    Returns:
        bestTree : The best decision tree after post pruning.
    """
    # Let the best decision tree D_Best be the current decision tree.
    bestTree = self.root

    # Validator bound to the validation set, used to compare candidate trees.
    validator = Validator(validation_set)

    nothingLeftToPrune = False
    for _ in range(L):
        # Copy D_Best into a new candidate tree D'.
        currTree = copy.deepcopy(bestTree)

        # Number of pruning steps for this attempt: random between 1 and K.
        M = random.randint(1, K)
        for _ in range(M):
            # Order the non-leaf nodes of D'. Index 0 is never selected
            # below - presumably it is the root; confirm against
            # self.order.
            nonLeafNodes = self.order(currTree)
            N = len(nonLeafNodes) - 1

            # No non-leaf node besides index 0: stop pruning altogether.
            if N <= 0:
                nothingLeftToPrune = True
                break

            # Let P be a random number between 1 and N.
            P = random.randint(1, N)

            # Replace the subtree rooted at node P by a leaf.
            # NOTE(review): val = -1 looks like the leaf marker; where the
            # majority class of the leaf comes from is not visible here.
            replaceNode = nonLeafNodes[P]
            replaceNode.val = -1
            replaceNode.left = None
            replaceNode.right = None

        if nothingLeftToPrune:
            # Leave through the common exit below so self.root is still
            # updated; returning from here directly would lose every
            # improvement accepted in earlier attempts.
            break

        # Evaluate D_Best and D' on the validation set.
        oldAccuracy = validator.calculateAccuracy(bestTree)
        newAccuracy = validator.calculateAccuracy(currTree)

        # Keep D' when it is at least as accurate as D_Best.
        if newAccuracy >= oldAccuracy:
            bestTree = currTree

    # Update the decision tree to be D_Best and return D_Best.
    self.root = bestTree
    return bestTree
def pruneTree(self, decisionTree, L, K, validation_set):
    """Post prune the given decision tree using two parameters L, K

    Runs up to L iterations. Each iteration deep-copies the best tree found
    so far, turns between 1 and K randomly chosen non-leaf nodes of the copy
    into leaves, and keeps the copy if its accuracy on the validation set is
    at least that of the current best tree.

    Pruning stops early as soon as a candidate has no prunable node left;
    that candidate is discarded. In every case ``decisionTree.root`` is
    updated to the best tree found before returning.

    Function Arguments:
        decisionTree : The tree object to prune; its ``root`` attribute is
                       read at the start and overwritten at the end.
        L : The number of iterations of post pruning.
        K : Upper bound on the number of nodes pruned per iteration; the
            actual count is drawn from 1..K. When L >= 1, K must be at
            least 1, otherwise ``random.randint`` raises ``ValueError``.
        validation_set : The validation data (passed to ``Validator``).

    Returns:
        Dbest : The post pruned tree.
    """
    Dbest = decisionTree.root

    # Validator against the validation set, used to decide whether a pruned
    # candidate is at least as accurate as the current best tree.
    validatorObj = Validator(validation_set)

    nothingLeftToPrune = False
    for _ in range(L):
        Dcurr = copy.deepcopy(Dbest)

        # Number of pruning steps this iteration: random between 1 and K.
        M = random.randint(1, K)
        for _ in range(M):
            # Index 0 of the ordered list is never selected below -
            # presumably it is the root; confirm against bfsOrderedNodes.
            nonLeafNodes = self.bfsOrderedNodes(Dcurr)
            N = len(nonLeafNodes) - 1
            if N <= 0:
                nothingLeftToPrune = True
                break

            # Let P be a random number generated between 1 and N.
            P = random.randint(1, N)

            # Collapse node P into a leaf.
            # NOTE(review): val = -1 looks like the leaf marker - confirm
            # against the node class.
            target = nonLeafNodes[P]
            target.val = -1
            target.left = None
            target.right = None

        if nothingLeftToPrune:
            # Leave through the common exit below so decisionTree.root is
            # still updated; returning from here directly would lose every
            # improvement accepted in earlier iterations.
            break

        accuracyOld = validatorObj.calculateAccuracy(Dbest)
        accuracyNew = validatorObj.calculateAccuracy(Dcurr)

        # Keep the pruned tree when it is at least as accurate as the best.
        if accuracyNew >= accuracyOld:
            Dbest = Dcurr

    decisionTree.root = Dbest
    return Dbest
def main():
    """Build, score, prune, and re-score two decision trees.

    Expected command line:
        L K train.csv validate.csv test.csv yes/no

    One tree is built with ``DecisionTree`` and one with
    ``DecisionTreeVarianceImp`` from the same training file. Each is scored
    on the test file, pruned with ``PruneTree.pruneTree`` using L, K and the
    validation file, and scored again. The trees themselves are printed only
    when the last argument is exactly "yes".
    """
    if len(sys.argv) < 7:
        print(
            "There should be 6 arguments -- L K train.csv validate.csv test.csv yes/no"
        )
        sys.exit(1)

    L = int(sys.argv[1])
    K = int(sys.argv[2])
    training_set = sys.argv[3]
    validation_set = sys.argv[4]
    test_set = sys.argv[5]
    show_trees = sys.argv[6] == "yes"

    # --- Before pruning: information gain tree ---
    decisionTree = DecisionTree(training_set)
    if show_trees:
        print(
            "Before Pruning: DecisionTree based on information Gain Heuristics"
        )
        print(decisionTree)
    validatorObj = Validator(test_set)
    print(
        "Before Pruning: Accuracy of Decision Tree based on information Gain Heuristics"
    )
    validatorObj.calculateAccuracy(decisionTree.root)
    validatorObj.printAccuracy()

    # --- Before pruning: variance impurity tree ---
    decisionTreeVarImp = DecisionTreeVarianceImp(training_set)
    if show_trees:
        print(
            "Before Pruning: DecisionTree based on Variance Impurity Heuristics"
        )
        print(decisionTreeVarImp)
    validatorVar = Validator(test_set)
    print(
        "Before Pruning: Accuracy of DecisionTree based on Variance Impurity Heuristics"
    )
    validatorVar.calculateAccuracy(decisionTreeVarImp.root)
    validatorVar.printAccuracy()

    # --- After pruning: information gain tree ---
    prune = PruneTree()
    prune.pruneTree(decisionTree, L, K, validation_set)
    if show_trees:
        print(
            "After Pruning: DecisionTree based on information Gain Heuristics")
        print(decisionTree)
    validatorObj.calculateAccuracy(decisionTree.root)
    print(
        "After Pruning: Accuracy of DecisionTree based on information Gain Heuristics"
    )
    validatorObj.printAccuracy()

    # --- After pruning: variance impurity tree ---
    prune = PruneTree()
    prune.pruneTree(decisionTreeVarImp, L, K, validation_set)
    if show_trees:
        # The tree header belongs with the tree dump: print it only when the
        # tree itself is printed, like the three sections above.
        print("After Pruning: DecisionTree based on Variance Impurity Heuristics")
        print(decisionTreeVarImp)
    # Score with the validator already paired with this tree; it was built
    # from the same test set as validatorObj.
    validatorVar.calculateAccuracy(decisionTreeVarImp.root)
    print(
        "After Pruning: Accuracy of DecisionTree based on Variance Impurity Heuristics"
    )
    validatorVar.printAccuracy()