def spamTest():
    """Run one random hold-out evaluation of the naive Bayes mail filter.

    Reads ``./email/spam/<i>.txt`` and ``./email/ham/<i>.txt`` for i in
    1..25, labelling documents from the spam directory 1 and documents from
    the ham directory 0.  Ten randomly chosen documents are held out as the
    test set; the remaining ones are vectorised with
    ``bayes.setOfwords2Vec`` and passed to ``bayes.trainNB0``.  Each
    held-out document is then classified with ``bayes.classifyNB`` and the
    fraction whose result differs from the recorded label is printed.

    Relies on ``textParse``, ``bayes`` and ``random`` being available at
    module level.  Returns nothing.
    """
    docList = []
    classList = []

    # read the mail
    for i in range(1, 26):
        for pathPattern, label in (('./email/spam/%d.txt', 1),
                                   ('./email/ham/%d.txt', 0)):
            # Context manager so every file handle is closed immediately
            # instead of being left to the garbage collector.
            with open(pathPattern % i) as mailFile:
                docList.append(textParse(mailFile.read()))
            classList.append(label)

    # get the dictionary
    vablist = bayes.createVocablist(docList)

    # Random test dataset: move 10 document indices out of the training set.
    # list(...) keeps the index pool mutable even where range() is lazy.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # uniform() may return the upper bound because of floating-point
        # rounding, so clamp the index to the last valid position.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Get the train dataset
    trainMat = [bayes.setOfwords2Vec(vablist, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # test the bayes
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa)
        if result != classList[docIndex]:
            errorCount += 1

    errorrate = float(errorCount) / len(testSet)
    # Single parenthesised argument: same output whether print is the
    # Python 2 statement or the Python 3 function.
    print("the filter spam mail error rate is %f" % errorrate)
def spamTest():
    """Run one random hold-out evaluation of the naive Bayes mail filter.

    Reads ``./email/spam/<i>.txt`` and ``./email/ham/<i>.txt`` for i in
    1..25, labelling documents from the spam directory 1 and documents from
    the ham directory 0.  Ten randomly chosen documents are held out as the
    test set; the remaining ones are vectorised with
    ``bayes.setOfwords2Vec`` and passed to ``bayes.trainNB0``.  Each
    held-out document is then classified with ``bayes.classifyNB`` and the
    fraction whose result differs from the recorded label is printed.

    Relies on ``textParse``, ``bayes`` and ``random`` being available at
    module level.  Returns nothing.
    """
    docList = []
    classList = []

    # read the mail
    for i in range(1, 26):
        for pathPattern, label in (('./email/spam/%d.txt', 1),
                                   ('./email/ham/%d.txt', 0)):
            # Context manager so every file handle is closed immediately
            # instead of being left to the garbage collector.
            with open(pathPattern % i) as mailFile:
                docList.append(textParse(mailFile.read()))
            classList.append(label)

    # get the dictionary
    vablist = bayes.createVocablist(docList)

    # Random test dataset: move 10 document indices out of the training set.
    # list(...) keeps the index pool mutable even where range() is lazy.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # uniform() may return the upper bound because of floating-point
        # rounding, so clamp the index to the last valid position.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Get the train dataset
    trainMat = [bayes.setOfwords2Vec(vablist, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # test the bayes
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa)
        if result != classList[docIndex]:
            errorCount += 1

    errorrate = float(errorCount) / len(testSet)
    # Single parenthesised argument: same output whether print is the
    # Python 2 statement or the Python 3 function.
    print("the filter spam mail error rate is %f" % errorrate)
email : [email protected]
"""
from numpy import *
import bayes
import FilterMail

# Demo script: walks through the helpers of the local ``bayes`` module on its
# bundled sample data and prints each intermediate result.
# NOTE(review): uses Python 2 print statements throughout.

# Load the sample data; the two returned values are used below as the list of
# documents and (presumably) their class labels -- confirm against bayes.
postingList, classVec = bayes.loadDataSet()

# Build the vocabulary from the sample documents and show it.
vablist = bayes.createVocablist(postingList)
print "Show my vablist\n", vablist
print "-------------------------------"

# Convert one hand-written word list into a vector over that vocabulary.
returnVec = bayes.setOfwords2Vec(vablist, ["my", "love", "dog", "happy", "daddy"])
print "the word vec is ", returnVec
print "-------------------------------"

# Vectorise every sample document, then train on the resulting matrix.
trainMat = []
for one in postingList:
    trainMat.append(bayes.setOfwords2Vec(vablist, one))
# trainNB0 returns three values; pa is printed as "the 1 probability" and the
# two vectors as per-class, per-element probabilities (semantics live in bayes).
pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, classVec)
print "the 1 probability is %f, " % pa
print "the each class , each element probability\n", p1Vec, '\n', p0Vec
print "--------------------------------"

# Run the test routine shipped with the bayes module (behaviour not visible here).
bayes.testNB()