def baggedDecTree(depth, dataSet):
    """Grow one decision tree on a bagged training set and pickle it.

    The data vectors for ``dataSet`` are built by
    ``main2.buildDataVectors`` using
    ``main2.FeatureSelection.InformationGain``.  ``createBag`` is applied to
    the training vectors and ``createTree`` grows a tree of the requested
    ``depth`` from the result.  The tree is pickled to
    ``<baseDir>baggedTrees/<r>-<dataSet>Tree<depth>.pickle``, where ``r`` is
    drawn with ``random.randrange(0, 50)``, and one timing line is appended
    to ``<baseDir>baggedTrees.op``.

    Args:
        depth: Passed to ``createTree``; also embedded in the pickle file
            name and in the log line.
        dataSet: Data-set identifier; converted with ``int()`` before use.

    Side effects:
        Rebinds the module-level names ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures, trainVectors, testVectors

    startTime = time.time()
    baseDir = "/N/u/hydargah/BigRed2/ml/"
    # This directory stores all the bagged trees.
    bagTreeDir = baseDir + "baggedTrees/"

    dataSet = int(dataSet)
    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    tree = createTree(createBag(trainVectors), depth)

    # NOTE: the prefix has only 50 possible values and the file is opened
    # with "wb", so a run that draws an already used prefix for the same
    # dataSet/depth overwrites the earlier tree.
    randomNumber = random.randrange(0, 50)
    picklePath = "%s%s-%sTree%s.pickle" % (
        bagTreeDir, randomNumber, dataSet, depth)

    # The log is opened first (as before) so it is created even when the
    # pickle step fails; the context managers close both handles on any exit.
    with open(baseDir + "baggedTrees.op", "a") as outputFile:
        with open(picklePath, "wb") as treePickleFile:
            pickle.dump(tree, treePickleFile)
        outputFile.write("Tree with depth %s written in time %s\n" %
                         (depth, time.time() - startTime))
def baggedDecTree(depth, dataSet):
    """Train a single tree on a bag of the training data and save it.

    Steps, in order:

    1. ``main2.buildDataVectors`` builds the feature list and the
       train/test vectors for ``int(dataSet)`` (feature selection is fixed
       to ``main2.FeatureSelection.InformationGain``).
    2. ``createBag`` is applied to the training vectors.
    3. ``createTree`` grows a tree of ``depth`` from that bag.
    4. The tree is pickled under ``baggedTrees/`` inside the base directory
       and a timing line is appended to ``baggedTrees.op``.

    Args:
        depth: Tree depth handed to ``createTree``; also part of the output
            file name and of the log line.
        dataSet: Identifier of the data set; coerced with ``int()``.

    Side effects:
        Assigns the module globals ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures
    global trainVectors
    global testVectors

    startTime = time.time()
    baseDir = "/N/u/hydargah/BigRed2/ml/"

    dataSet = int(dataSet)
    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    baggedTrainVectors = createBag(trainVectors)
    tree = createTree(baggedTrainVectors, depth)

    # This directory stores all the bagged trees.
    bagTreeDir = baseDir + "baggedTrees/"
    # A random prefix in [0, 50) tells apart trees built with the same
    # dataSet/depth.  NOTE: equal draws overwrite each other ("wb" mode).
    randomNumber = random.randrange(0, 50, 1)
    treeFileName = (str(randomNumber) + "-" + str(dataSet) + "Tree" +
                    str(depth) + ".pickle")

    # "with" guarantees both files are closed even if pickling or writing
    # raises; the log is opened before the pickle file, as it was before.
    with open(baseDir + "baggedTrees.op", "a") as outputFile:
        with open(bagTreeDir + treeFileName, "wb") as treePickleFile:
            pickle.dump(tree, treePickleFile)
        elapsed = time.time() - startTime
        outputFile.write("Tree with depth " + str(depth) +
                         " written in time " + str(elapsed) + "\n")
def runDecTree(dataSet, featureSelectionMethod):
    """Build, evaluate and report decision trees for ``dataSet``.

    For every depth in ``range(0, 1)`` (i.e. only depth 0) a tree is grown
    from the training vectors with ``createTree``, the first 20 test
    vectors are classified with ``classifyNewSample``, and the outcome is
    summarised by ``resultsToAccuracy`` and ``TPTNRates``.  A tab-separated
    table and the total runtime are printed to standard output.

    Args:
        dataSet: Forwarded unchanged to ``main2.buildDataVectors``.
        featureSelectionMethod: Currently ignored;
            ``main2.FeatureSelection.InformationGain`` is always used.
            NOTE(review): this looks unintended - confirm with the callers
            before wiring the argument through.

    Side effects:
        Rebinds the module-level names ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures, trainVectors, testVectors

    startTime = time.time()

    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    # One (depth, accuracy, rates) tuple per evaluated depth.
    depthAccuracyTPTN = []
    for d in range(0, 1):
        tree = createTree(trainVectors, depth=d)
        # Only the first 20 test vectors are classified.
        results = classifyNewSample(tree, testVectors[:20], depth=d)
        accuracy = resultsToAccuracy(results)
        tptnRates = TPTNRates(results)
        depthAccuracyTPTN.append((d, accuracy, tptnRates))

    # Three blank lines separate the table from any earlier output.
    print("")
    print("")
    print("")
    print("Depth\t Accuracy\t TP\t TN\t FP\t FN")
    for depth, accuracy, rates in depthAccuracyTPTN:
        columns = (depth, accuracy, rates[0], rates[1], rates[2], rates[3])
        print("\t ".join(str(value) for value in columns))

    totalTime = time.time() - startTime
    print("Total Runtime = " + str(totalTime))
    # The second, equivalent definition of runDecTree starts below.
def runDecTree(dataSet, featureSelectionMethod):
    """Evaluate decision trees on ``dataSet`` and print a results table.

    The data vectors come from ``main2.buildDataVectors``.  For each depth
    in ``range(0, 1)`` (only depth 0) a tree is created from the training
    vectors, the first 20 test vectors are classified, and the accuracy and
    the values returned by ``TPTNRates`` are collected.  The collected rows
    are printed under the header ``Depth / Accuracy / TP / TN / FP / FN``,
    followed by the total runtime.

    Args:
        dataSet: Passed as-is to ``main2.buildDataVectors``.
        featureSelectionMethod: Not used at present; feature selection is
            fixed to ``main2.FeatureSelection.InformationGain``.
            NOTE(review): presumably this argument was meant to be
            forwarded - verify against the callers.

    Side effects:
        Assigns the module globals ``liFeatures``, ``trainVectors`` and
        ``testVectors``.
    """
    global liFeatures
    global trainVectors
    global testVectors

    startTime = time.time()

    liFeatures, trainVectors, testVectors = main2.buildDataVectors(
        dataSet, main2.FeatureSelection.InformationGain)

    depthAccuracyTPTN = []
    for d in range(0, 1):
        tree = createTree(trainVectors, depth=d)
        # Evaluation is limited to the first 20 test vectors.
        results = classifyNewSample(tree, testVectors[:20], depth=d)
        depthAccuracyTPTN.append(
            (d, resultsToAccuracy(results), TPTNRates(results)))

    # Three empty lines precede the table.
    for _ in range(3):
        print("")
    print("Depth\t Accuracy\t TP\t TN\t FP\t FN")
    for row in depthAccuracyTPTN:
        rates = row[2]
        cells = [str(row[0]), str(row[1]), str(rates[0]), str(rates[1]),
                 str(rates[2]), str(rates[3])]
        print("\t ".join(cells))

    totalTime = time.time() - startTime
    print("Total Runtime = " + str(totalTime))
def main():
    """Compare one bagged tree with a majority vote over nine bagged trees.

    Every file in ``baggedTrees/`` whose name contains ``"Tree20"`` is
    unpickled.  The test vectors of data set 1 are then classified

    * by the tree at index 6 of the loaded list, and
    * by the first nine loaded trees, whose "POSITIVE"/"NEGATIVE" votes are
      combined by majority; a tie is resolved by ``coinToss()``.

    For both runs the accuracy and the four values returned by
    ``decTree.TPTNRates`` are printed.

    Raises:
        IndexError: If fewer than seven matching trees were loaded, or if
            fewer than nine were loaded and there is at least one test
            vector to vote on.
    """
    baseDir = ""
    baggedDir = baseDir + "baggedTrees/"

    # Load all the bagged trees into main memory.
    # NOTE: os.listdir returns names in arbitrary order, so which trees end
    # up at index 6 and at indices 0-8 depends on the file system.
    listOfTrees = []
    for fileName in os.listdir(baggedDir):
        if "Tree20" not in fileName:
            continue
        # NOTE: unpickling can execute arbitrary code; only load tree files
        # this project produced itself.
        with open(baggedDir + fileName, "rb") as pickleFile:
            listOfTrees.append(pickle.load(pickleFile))

    # Only the test vectors are needed here.
    _, _, testVectors = main2.buildDataVectors(
        1, main2.FeatureSelection.InformationGain)

    # --- single tree ---
    tree = listOfTrees[6]
    biglist = [decTree.classifyNewSample(tree, [vec], depth=20)[0]
               for vec in testVectors]

    accuracy = decTree.resultsToAccuracy(biglist)
    tptnRates = decTree.TPTNRates(biglist)

    print("Results for a single tree ")
    print("Accuracy\t TP\t TN\t FP\t FN")
    print("\t".join(str(value) for value in (
        accuracy, tptnRates[0], tptnRates[1], tptnRates[2], tptnRates[3])))

    # --- majority vote of the first nine trees ---
    allPredictions = []
    for vec in testVectors:
        # Each vote is the single result for this vector from one tree;
        # element 0 is the predicted label, element 1 the expected one.
        votes = [decTree.classifyNewSample(listOfTrees[j], [vec], depth=20)[0]
                 for j in range(0, 9)]

        countPos = sum(1 for vote in votes if vote[0] == "POSITIVE")
        countNeg = sum(1 for vote in votes if vote[0] == "NEGATIVE")
        # As before, the expected label is taken from the last vote.
        expectedValue = votes[-1][1]

        if countPos == countNeg:
            prediction = coinToss()
        elif countPos > countNeg:
            prediction = "POSITIVE"
        else:
            prediction = "NEGATIVE"

        allPredictions.append([prediction, expectedValue])

    accuracy = decTree.resultsToAccuracy(allPredictions)
    tptnRates = decTree.TPTNRates(allPredictions)
    print("Results for bagged Tree ")
    print("Accuracy\t TP\t TN\t FP\t FN")
    print("\t".join(str(value) for value in (
        accuracy, tptnRates[0], tptnRates[1], tptnRates[2], tptnRates[3])))
# Example #6
# 0
def main():
    """Report the quality of one bagged tree and of a nine-tree ensemble.

    All pickles in ``baggedTrees/`` whose file name contains ``"Tree20"``
    are loaded.  Using the test vectors of data set 1, two result lists are
    produced and reported:

    1. the classifications of the tree at index 6 of the loaded list;
    2. a majority vote of the first nine loaded trees over the labels
       "POSITIVE" and "NEGATIVE", with ``coinToss()`` deciding ties.

    Raises:
        IndexError: If fewer than seven matching trees were loaded, or if
            fewer than nine were loaded and at least one test vector has
            to be voted on.
    """
    baseDir = ""
    baggedDir = baseDir + "baggedTrees/"
    treeFileList = os.listdir(baggedDir)

    # Load all the bagged trees into main memory.
    listOfTrees = []
    for f in treeFileList:
        if "Tree20" in f:
            # The context manager closes the handle once the tree is read.
            # NOTE: pickle.load can run arbitrary code; the files must be
            # trusted (i.e. written by this project).
            with open(baggedDir + f, "rb") as pickleFile:
                listOfTrees.append(pickle.load(pickleFile))

    # The feature list and the training vectors are not needed here.
    testVectors = main2.buildDataVectors(
        1, main2.FeatureSelection.InformationGain)[2]

    tree = listOfTrees[6]
    biglist = []
    for vec in testVectors:
        biglist.append(decTree.classifyNewSample(tree, [vec], depth=20)[0])
    _printResults("Results for a single tree ", biglist)

    allPredictions = []
    for vec in testVectors:
        vecPrediction = []
        for j in range(0, 9):
            result = decTree.classifyNewSample(listOfTrees[j], [vec],
                                               depth=20)
            vecPrediction.append(result[0])

        # Element 0 of each result is the predicted label, element 1 the
        # expected label; the expected label of the last vote is kept.
        labels = [result[0] for result in vecPrediction]
        countPos = labels.count("POSITIVE")
        countNeg = labels.count("NEGATIVE")
        expectedValue = vecPrediction[-1][1]

        if countPos > countNeg:
            prediction = "POSITIVE"
        elif countPos < countNeg:
            prediction = "NEGATIVE"
        else:
            prediction = coinToss()

        allPredictions.append([prediction, expectedValue])

    _printResults("Results for bagged Tree ", allPredictions)


def _printResults(title, results):
    """Print ``title``, a header row and the accuracy/rate row for ``results``."""
    accuracy = decTree.resultsToAccuracy(results)
    tptnRates = decTree.TPTNRates(results)
    print(title)
    print("Accuracy\t TP\t TN\t FP\t FN")
    print(str(accuracy) + "\t" + str(tptnRates[0]) + "\t" +
          str(tptnRates[1]) + "\t" + str(tptnRates[2]) + "\t" +
          str(tptnRates[3]))