예제 #1
0
def classifyNewSample(tree, testData, fileName, nominalColumns):
    """Classify every row of testData with the tree and report the outcome.

    For each row the matching leaf is obtained from getRelevantLeafNode and
    the label with the highest occurrence count in that leaf's ``leafValues``
    mapping (label -> count) becomes the prediction.

    Args:
        tree: Root of the decision tree, handed to getRelevantLeafNode.
        testData: Iterable of rows; the last element of each row is used as
            the expected label.
        fileName: Passed through unchanged to writeResult.
        nominalColumns: Passed through unchanged to getRelevantLeafNode.

    Returns:
        The value returned by computeStats for the list of
        ``[str(predicted label), expected label]`` pairs, one per row.

    Notes:
        When several labels share the highest count, the one that comes
        first in the dict's iteration order wins. A leaf with no labels, or
        a multi-label leaf whose counts are all zero, yields the prediction
        ``"None"``.
    """
    predictionsPlusExpectedValues = []

    for row in testData:
        leaf = getRelevantLeafNode(tree, row, nominalColumns).leafValues

        if len(leaf) == 1:
            # Take the key itself rather than leaf.keys(): on Python 3 that
            # is a view object, and str() of it would be written out as
            # "dict_keys([...])" instead of the label.
            predictedLabel = next(iter(leaf))
        else:
            # Comparing raw counts gives the same winner as comparing the
            # normalised probabilities, without dividing by the total.
            predictedLabel = None
            highestCount = 0
            for label in leaf:
                # Strict '>' keeps the first label seen when counts tie.
                if leaf[label] > highestCount:
                    highestCount = leaf[label]
                    predictedLabel = label

        # One [prediction, expected] pair per row.
        predictionsPlusExpectedValues.append(
            [str(predictedLabel), row[len(row) - 1]])

    writeResult(predictionsPlusExpectedValues, fileName)
    return computeStats(predictionsPlusExpectedValues)
예제 #2
0
def classifyNewSample(tree, testData, fileName, nominalColumns):
    """Predict a label for each test row and hand the results to the reporters.

    The leaf reached by a row is looked up with getRelevantLeafNode; its
    ``leafValues`` mapping (label -> occurrence count) decides the
    prediction: the label with the largest count is chosen.

    Args:
        tree: Root of the decision tree, forwarded to getRelevantLeafNode.
        testData: Iterable of rows whose last element is the expected label.
        fileName: Forwarded unchanged to writeResult.
        nominalColumns: Forwarded unchanged to getRelevantLeafNode.

    Returns:
        Whatever computeStats returns when given the list of
        ``[str(predicted label), expected label]`` pairs.

    Notes:
        Ties between equally frequent labels are resolved in favour of the
        label that appears first when iterating the dict. If the leaf holds
        no labels, or holds several labels that all have a zero count, the
        prediction is the string ``"None"``.
    """
    predictionsPlusExpectedValues = []

    for row in testData:
        leaf = getRelevantLeafNode(tree, row, nominalColumns).leafValues

        predictedLabel = None
        if len(leaf) == 1:
            # Only one candidate: use that key directly. leaf.keys() must
            # not be used here because on Python 3 it is a view, not a list,
            # and would be stringified as "dict_keys([...])".
            for onlyLabel in leaf:
                predictedLabel = onlyLabel
        else:
            # The most frequent label is also the most probable one, so the
            # counts can be compared directly.
            bestCount = 0
            for candidate in leaf:
                occurrences = leaf[candidate]
                # Strict comparison: the earliest label wins a tie.
                if occurrences > bestCount:
                    bestCount = occurrences
                    predictedLabel = candidate

        expectedLabel = row[len(row) - 1]
        predictionsPlusExpectedValues.append(
            [str(predictedLabel), expectedLabel])

    writeResult(predictionsPlusExpectedValues, fileName)
    return computeStats(predictionsPlusExpectedValues)
def classifyNewSample(tree, testData, depth, fileName, nominalColumns):
    """Classify each row of testData by walking the tree, then report results.

    Starting at ``tree``, every row descends one branch per node until a
    node whose ``leafValues`` is not None is reached. The label with the
    highest count in that mapping (label -> occurrence count) is the
    prediction.

    Args:
        tree: Root node. Nodes are read through the attributes ``col``,
            ``criteria``, ``leftBranch``, ``rightBranch`` and ``leafValues``.
        testData: Iterable of rows, indexable by ``col``; the last element
            of each row is used as the expected label.
        depth: When 0 the root's ``leafValues`` is used directly without
            descending. Also forwarded to writeResult.
        fileName: Forwarded unchanged to writeResult.
        nominalColumns: Container of column indices that are compared by
            equality; every other column is compared numerically.

    Returns:
        The value returned by computeStats for the list of
        ``[str(predicted label), expected label]`` pairs.

    Notes:
        Equally frequent labels are resolved in favour of the one that
        comes first in the dict's iteration order. A leaf without labels,
        or a multi-label leaf whose counts are all zero, produces the
        prediction ``"None"``.
    """
    predictionsPlusExpectedValues = []

    for row in testData:
        if depth == 0:
            # Special case: the tree was never split, the root is the leaf.
            leaf = tree.leafValues
        else:
            # Walk down iteratively until a node carrying leaf values is hit.
            currentNode = tree
            leaf = None
            while leaf is None:
                if currentNode.col not in nominalColumns:
                    # Continuous column: numeric threshold test.
                    matches = (float(row[currentNode.col])
                               <= float(currentNode.criteria))
                else:
                    # Nominal column: exact match against the criteria.
                    matches = row[currentNode.col] == currentNode.criteria

                # Rows satisfying the criteria go right, the rest go left.
                if matches:
                    currentNode = currentNode.rightBranch
                else:
                    currentNode = currentNode.leftBranch
                leaf = currentNode.leafValues

        if len(leaf) == 1:
            # Use the single key itself; leaf.keys() is a view on Python 3
            # and would be written out as "dict_keys([...])".
            predictedLabel = next(iter(leaf))
        else:
            # Highest count == highest probability, so compare counts.
            predictedLabel = None
            highestCount = 0
            for label in leaf:
                # Strict '>' keeps the first label seen when counts tie.
                if leaf[label] > highestCount:
                    highestCount = leaf[label]
                    predictedLabel = label

        # One [prediction, expected] pair per row.
        predictionsPlusExpectedValues.append(
            [str(predictedLabel), row[len(row) - 1]])

    writeResult(predictionsPlusExpectedValues, depth, fileName)
    return computeStats(predictionsPlusExpectedValues)