def runClassification(trainingVocabList, fullData, fullClassVec):
    """Collect the top-scoring vocabulary words for each class on a random hold-out.

    Ten documents are drawn at random (without replacement) from ``fullData``
    as a test set; the remaining documents are used to train the naive Bayes
    model.  For every held-out document the per-word scores of both classes
    are computed, the 30 best words per document are merged into one
    duplicate-free list per class, and the top 30 of each merged list are
    returned.

    NOTE(review): a second ``runClassification`` with a different signature
    appears later in this source; if both live in the same module the later
    definition replaces this one.

    Args:
        trainingVocabList: vocabulary passed to ``naiveBayes.trainData`` and
            ``naiveBayes.bagOfWordsToVector``.
        fullData: list of documents.  It is NOT modified; the split is done
            on a shallow copy.
        fullClassVec: list of class labels aligned with ``fullData``.  It is
            NOT modified either.

    Returns:
        A 2-tuple ``(topWordsClass0, topWordsClass1)`` as produced by
        ``getTopNFromList``.

    Raises:
        ValueError: if ``fullData`` holds fewer documents than the hold-out
            size.
    """
    # Function-scope import (as in the original), hoisted out of the loop.
    import random

    TESTINGDATASIZE = 10
    TOPWORDCOUNT = 30

    # Work on copies so repeated calls do not keep shrinking the caller's
    # lists (the original aliased and deleted from them in place).
    trainingData = list(fullData)
    trainingClassVec = list(fullClassVec)

    if len(trainingData) < TESTINGDATASIZE:
        raise ValueError(
            'need at least %d documents to build the test set, got %d'
            % (TESTINGDATASIZE, len(trainingData)))

    # Split into training and test data.
    testingData = []
    for _ in range(TESTINGDATASIZE):
        # randrange never returns len(trainingData); int(random.uniform(0, n))
        # can yield n when the upper endpoint is hit.
        i = random.randrange(len(trainingData))
        testingData.append(trainingData.pop(i))
        # Keep the label vector aligned with the remaining training documents.
        del trainingClassVec[i]

    # trainData returns one (pC, pWGivenC) pair per class.
    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(
        trainingVocabList, trainingData, trainingClassVec)

    topPC0 = []
    topPC1 = []
    for testData in testingData:
        testDataVector = np.array(
            naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        # Build a UNIQUE list of the highest-scoring words for each class.
        topPC0 = addUnique(
            topPC0, getTopN(trainingVocabList, pC0GivenData, TOPWORDCOUNT))
        topPC1 = addUnique(
            topPC1, getTopN(trainingVocabList, pC1GivenData, TOPWORDCOUNT))

    return (getTopNFromList(topPC0, TOPWORDCOUNT),
            getTopNFromList(topPC1, TOPWORDCOUNT))
def runClassification(trainingData, trainingClassVec):
    """Train on a random split and return the hold-out error rate.

    Ten documents are drawn at random (without replacement) as a test set.
    A vocabulary is built from the remaining documents, the naive Bayes model
    is trained on them, and every held-out document is assigned class 0 when
    its summed log score for class 0 is strictly greater than that for
    class 1, otherwise class 1.

    When the module-level ``DEBUG`` flag is truthy, the predicted labels, the
    actual labels, the error count and the misclassified documents are
    printed.

    Args:
        trainingData: list of documents.  It is NOT modified; the split is
            done on a shallow copy.
        trainingClassVec: list of class labels aligned with ``trainingData``.
            It is NOT modified either.

    Returns:
        The fraction of held-out documents that were misclassified, as a
        float in ``[0.0, 1.0]``.

    Raises:
        ValueError: if ``trainingData`` holds fewer documents than the
            hold-out size.
    """
    # Function-scope import (as in the original), hoisted out of the loop.
    import random

    TESTINGDATASIZE = 10

    # Work on copies so repeated calls do not keep shrinking the caller's
    # lists (the original deleted from them in place).
    trainingData = list(trainingData)
    trainingClassVec = list(trainingClassVec)

    if len(trainingData) < TESTINGDATASIZE:
        raise ValueError(
            'need at least %d documents to build the test set, got %d'
            % (TESTINGDATASIZE, len(trainingData)))

    # Split training and test data.
    testingData = []
    actualTestingVec = []
    for _ in range(TESTINGDATASIZE):
        # randrange never returns len(trainingData); int(random.uniform(0, n))
        # can yield n when the upper endpoint is hit.
        i = random.randrange(len(trainingData))
        testingData.append(trainingData.pop(i))
        actualTestingVec.append(trainingClassVec.pop(i))

    # The vocabulary is built from the training portion only.
    trainingVocabList = naiveBayes.createVocabList(trainingData)
    # trainData returns one (pC, pWGivenC) pair per class.
    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(
        trainingVocabList, trainingData, trainingClassVec)

    predictedTestingVec = []
    for testData in testingData:
        testDataVector = np.array(
            naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        # Ties go to class 1, exactly as in the original comparison.
        if sum(np.log(pC0GivenData)) > sum(np.log(pC1GivenData)):
            predictedTestingVec.append(0)
        else:
            predictedTestingVec.append(1)

    misClassified = [
        doc
        for doc, actual, predicted in zip(
            testingData, actualTestingVec, predictedTestingVec)
        if actual != predicted
    ]
    error = len(misClassified)

    if DEBUG:
        # Single-argument parenthesised prints emit the same text under
        # Python 2 and also parse under Python 3.
        print(predictedTestingVec)
        print(actualTestingVec)
        print('num errors: %d' % error)
        print('misclassified:')
        print(misClassified)

    return float(error) / TESTINGDATASIZE