def spamTest():
    """Evaluate the naive Bayes spam filter on a random hold-out split.

    Reads ``./email/spam/<i>.txt`` and ``./email/ham/<i>.txt`` for ``i`` in
    1..25, runs each file's text through ``textParse`` and labels spam
    documents 1 and ham documents 0.  Ten randomly chosen documents are held
    out as the test set; the remaining documents are used for training via
    ``bayes.trainNB0``.  The fraction of misclassified held-out documents is
    printed.

    Returns:
        None.  The error rate is only printed.
    """
    docList = []
    classList = []

    # Read the mail: for every index the spam file comes first, then the ham
    # file.  ``with`` closes each file as soon as it has been read.
    sources = (('./email/spam/%d.txt', 1), ('./email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pattern, label in sources:
            with open(pattern % i) as mailFile:
                docList.append(textParse(mailFile.read()))
            classList.append(label)

    # Vocabulary over every document that was read.
    vablist = bayes.createVocablist(docList)

    # Randomly move 10 document indices from the training set to the test set.
    # list(...) keeps the index pool mutable on Python 3 as well.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # uniform() may return its upper bound because of floating-point
        # rounding; clamp so the index always stays inside the list.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Build the training matrix and the matching labels.
    trainMat = [bayes.setOfwords2Vec(vablist, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]

    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        if bayes.classifyNB(testVec, p1Vec, p0Vec, pa) != classList[docIndex]:
            errorCount += 1
    errorrate = float(errorCount) / len(testSet)
    print("the filter spam mail error rate is %f" % errorrate)
def spamTest():
    """Evaluate the naive Bayes spam filter on a random hold-out split.

    Reads ``./email/spam/<i>.txt`` and ``./email/ham/<i>.txt`` for ``i`` in
    1..25, runs each file's text through ``textParse`` and labels spam
    documents 1 and ham documents 0.  Ten randomly chosen documents are held
    out as the test set; the remaining documents are used for training via
    ``bayes.trainNB0``.  The fraction of misclassified held-out documents is
    printed.

    Returns:
        None.  The error rate is only printed.
    """
    docList = []
    classList = []

    # Read the mail: for every index the spam file comes first, then the ham
    # file.  ``with`` closes each file as soon as it has been read.
    sources = (('./email/spam/%d.txt', 1), ('./email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pattern, label in sources:
            with open(pattern % i) as mailFile:
                docList.append(textParse(mailFile.read()))
            classList.append(label)

    # Vocabulary over every document that was read.
    vablist = bayes.createVocablist(docList)

    # Randomly move 10 document indices from the training set to the test set.
    # list(...) keeps the index pool mutable on Python 3 as well.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # uniform() may return its upper bound because of floating-point
        # rounding; clamp so the index always stays inside the list.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Build the training matrix and the matching labels.
    trainMat = [bayes.setOfwords2Vec(vablist, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]

    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        if bayes.classifyNB(testVec, p1Vec, p0Vec, pa) != classList[docIndex]:
            errorCount += 1
    errorrate = float(errorCount) / len(testSet)
    print("the filter spam mail error rate is %f" % errorrate)
Exemple #3
0
    email  :  [email protected]
"""

from numpy import *
import bayes
import FilterMail

postingList, classVec = bayes.loadDataSet()

# get the vablist
vablist = bayes.createVocablist(postingList)
print "Show my vablist\n", vablist
print "-------------------------------"

# get the returnVec
returnVec = bayes.setOfwords2Vec(vablist, ["my", "love", "dog", "happy", "daddy"])
print "the word vec is ", returnVec
print "-------------------------------"
# get the prior probability
trainMat = []
for one in postingList:
    trainMat.append(bayes.setOfwords2Vec(vablist, one))

pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, classVec)
print "the 1 probability is %f, " % pa
print "the each class , each element probability\n", p1Vec, '\n', p0Vec


print "--------------------------------"
bayes.testNB()