Exemple #1
0
def main():
    """Grow a decision tree, post-prune it, and report accuracy before and after.

    Command-line arguments (read from ``sys.argv``):
        1    : L, an integer handed to ``DecisionTree.pruneTree``.
        2    : K, an integer handed to ``DecisionTree.pruneTree``.
        3-5  : File names of the training, validation and test sets; each is
               resolved relative to ``./data1/``.

    Prints a usage line and exits with status 1 when fewer than five
    arguments are supplied.
    """
    argv = sys.argv

    # Bail out early when the command line is incomplete.
    if len(argv) < 6:
        usage = "-- python --|-- main.py --|-- L --|-- K --|-- training_set.csv --|-- validation_set.csv --|-- test_set.csv --|"
        print(usage)
        sys.exit(1)

    # The two integers that parameterize the pruning step.
    L, K = int(argv[1]), int(argv[2])

    # All three data files are looked up in the same directory.
    dataDir = './data1/'
    training_set, validation_set, test_set = (
        dataDir + name for name in argv[3:6]
    )

    # Fit the tree on the training data.
    tree = DecisionTree(training_set)

    # Accuracy is measured against the test data both before and after pruning.
    validator = Validator(test_set)

    # Baseline: the tree as built, before any pruning.
    validator.calculateAccuracy(tree.root)
    print("\nA decision tree is fully grown to fit the training data.")
    validator.displayAccuracy()

    # Announce the pruning step and its parameters.
    print("\nPost prunning", '.' * 30)
    print("L =", L, ", K =", K, ", the pruned decision tree is:\n")

    # Prune in place, using the validation data to judge candidates.
    tree.pruneTree(L, K, validation_set)

    # Printing goes through the tree's string conversion.
    print(tree)

    # Re-score the pruned tree on the same test data.
    validator.calculateAccuracy(tree.root)
    validator.displayAccuracy()
Exemple #2
0
	def pruneTree(self, L, K, validation_set):
		"""Post prune the decision tree using L, K and the validation data.

		Makes L pruning attempts. Each attempt deep-copies the best tree found
		so far, turns a random number (between 1 and K) of randomly chosen
		nodes into leaves, and keeps the copy when its accuracy on the
		validation set is at least as high as that of the current best tree.

		Args:
		    L              : The number of post pruning attempts.
		    K              : Upper bound on the number of nodes pruned in one
		                     attempt; the actual count is random.randint(1, K).
		    validation_set : The validation data, handed to Validator.

		Returns:
		    bestTree : The root of the best tree found. self.root is set to
		               the same node on every exit path.

		Raises:
		    ValueError : From random.randint when L >= 1 and K < 1.

		"""

		# Let the best decision tree D_Best be the current decision tree
		bestTree = self.root

		# Create a validator using the validation set to test decision tree models
		validator = Validator(validation_set)

		for _ in range(L):

			# Copy tree D into a new tree D'
			currTree = copy.deepcopy(bestTree)

			# Number of nodes to prune in this attempt, between 1 and K.
			M = random.randint(1, K)

			for _ in range(M):

				# Order the candidate nodes of D'. Index 0 is never selected below;
				# presumably self.order returns the non-leaf nodes with the root
				# first -- TODO confirm against self.order.
				nonLeafNodes = self.order(currTree)
				N = len(nonLeafNodes) - 1

				# Terminate pruning if there is nothing left to prune besides
				# the entry at index 0.
				if N <= 0:
					# Publish the best tree before leaving: callers read
					# self.root, so trees accepted in earlier attempts must
					# not be dropped on this early exit.
					self.root = bestTree
					return bestTree

				# Let P be a random number between 1 and N.
				P = random.randint(1, N)

				# Replace the subtree at node P in D' by a leaf: drop both
				# children and set val to -1.
				# NOTE(review): presumably -1 marks a leaf whose class is
				# resolved elsewhere -- confirm against the node class.
				replaceNode = nonLeafNodes[P]
				replaceNode.val = -1
				replaceNode.left = None
				replaceNode.right = None

			# Evaluate the accuracy of D_Best and D' on the validation set
			oldAccuracy = validator.calculateAccuracy(bestTree)
			newAccuracy = validator.calculateAccuracy(currTree)

			# Ties go to D': it replaces D_Best when it is at least as accurate.
			if newAccuracy >= oldAccuracy:
				bestTree = currTree

		# Update the decision tree to be D_Best and return D_Best
		self.root = bestTree
		return bestTree
Exemple #3
0
    def pruneTree(self, decisionTree, L, K, validation_set):
        """Post prune the given decision tree using two parameters L, K.

        Runs L pruning iterations. Each iteration deep-copies the best tree
        found so far, turns a random number (between 1 and K) of randomly
        chosen nodes into leaves, and keeps the copy when its accuracy on the
        validation set is at least as high as that of the current best tree.

        Function Arguments:
            decisionTree   : Object whose ``root`` attribute is the tree to
                             prune; ``root`` is overwritten with the result.
            L              : The number of iterations of post pruning.
            K              : Upper bound on the number of nodes pruned in one
                             iteration; the actual count is
                             random.randint(1, K).
            validation_set : The validation data, handed to Validator.
        Returns:
            Dbest : The root of the post pruned tree. ``decisionTree.root`` is
                    set to the same node on every exit path.
        Raises:
            ValueError : From random.randint when L >= 1 and K < 1.
        """

        Dbest = decisionTree.root

        # Validator over the validation set, used to decide whether a pruned
        # candidate is kept.
        validatorObj = Validator(validation_set)

        for _ in range(L):
            Dcurr = copy.deepcopy(Dbest)

            # Number of nodes to prune in this iteration, between 1 and K.
            M = random.randint(1, K)

            for _ in range(M):

                # Index 0 is never selected below; presumably bfsOrderedNodes
                # returns the non-leaf nodes with the root first -- TODO
                # confirm against bfsOrderedNodes.
                nonLeafNodes = self.bfsOrderedNodes(Dcurr)
                N = len(nonLeafNodes) - 1

                if N <= 0:
                    # Publish the best tree before leaving: callers read
                    # decisionTree.root, so trees accepted in earlier
                    # iterations must not be dropped on this early exit.
                    decisionTree.root = Dbest
                    return Dbest

                # Let P be a random number generated between 1 and N.
                P = random.randint(1, N)

                # Turn node P into a leaf: drop both children, set val to -1.
                # NOTE(review): presumably -1 marks a leaf whose class is
                # resolved elsewhere -- confirm against the node class.
                prunedNode = nonLeafNodes[P]
                prunedNode.val = -1
                prunedNode.left = None
                prunedNode.right = None

            accuracyOld = validatorObj.calculateAccuracy(Dbest)
            accuracyNew = validatorObj.calculateAccuracy(Dcurr)

            # Ties go to the pruned tree: it replaces the best tree when it is
            # at least as accurate.
            if accuracyNew >= accuracyOld:
                Dbest = Dcurr

        decisionTree.root = Dbest
        return Dbest
Exemple #4
0
def main():
    """Build two decision trees, prune both, and report test accuracy.

    One tree is built with ``DecisionTree`` and one with
    ``DecisionTreeVarianceImp``; the printed labels call these the
    information gain and variance impurity heuristics. Each tree is scored on
    the test set before and after pruning with ``PruneTree.pruneTree``.

    Command-line arguments (read from ``sys.argv``):
        1 : L, an integer handed to ``pruneTree``.
        2 : K, an integer handed to ``pruneTree``.
        3 : Path of the training set.
        4 : Path of the validation set.
        5 : Path of the test set.
        6 : "yes" to also print the trees; any other value prints only the
            accuracy reports.

    Prints a usage line and exits with status 1 when fewer than six
    arguments are supplied.
    """

    if len(sys.argv) < 7:
        print(
            "There should be 6 arguments -- L K train.csv validate.csv test.csv yes/no"
        )
        sys.exit(1)

    L = int(sys.argv[1])
    K = int(sys.argv[2])

    training_set = sys.argv[3]
    validation_set = sys.argv[4]
    test_set = sys.argv[5]
    to_print = sys.argv[6]

    decisionTree = DecisionTree(training_set)
    if to_print == "yes":
        print(
            "Before Pruning: DecisionTree based on information Gain Heuristics"
        )
        print(decisionTree)

    validatorObj = Validator(test_set)
    print(
        "Before Pruning: Accuracy of Decision Tree based on information Gain Heuristics"
    )
    validatorObj.calculateAccuracy(decisionTree.root)
    validatorObj.printAccuracy()

    decisionTreeVarImp = DecisionTreeVarianceImp(training_set)
    if to_print == "yes":
        print(
            "Before Pruning: DecisionTree based on Variance Impurity Heuristics"
        )
        print(decisionTreeVarImp)

    validatorVar = Validator(test_set)
    print(
        "Before Pruning: Accuracy of DecisionTree based on Variance Impurity Heuristics"
    )
    validatorVar.calculateAccuracy(decisionTreeVarImp.root)
    validatorVar.printAccuracy()

    # Post pruning starts
    prune = PruneTree()

    prune.pruneTree(decisionTree, L, K, validation_set)
    if to_print == "yes":
        print(
            "After Pruning: DecisionTree based on information Gain Heuristics")
        print(decisionTree)

    validatorObj.calculateAccuracy(decisionTree.root)
    print(
        "After Pruning: Accuracy of DecisionTree based on information Gain Heuristics"
    )
    validatorObj.printAccuracy()

    prune = PruneTree()

    prune.pruneTree(decisionTreeVarImp, L, K, validation_set)
    if to_print == "yes":
        # The tree header is only meaningful when the tree itself is printed,
        # matching the three other tree printouts above.
        print("After Pruning: DecisionTree based on Variance Impurity Heuristics")
        print(decisionTreeVarImp)

    # Score the variance-impurity tree with its own validator, as was done
    # before pruning.
    validatorVar.calculateAccuracy(decisionTreeVarImp.root)
    print(
        "After Pruning: Accuracy of DecisionTree based on Variance Impurity Heuristics"
    )
    validatorVar.printAccuracy()