def tests():
    """Run the two demo routines of the ``bayes`` module, one after the other.

    Calls ``bayes.testingNB()`` first and ``bayes.spamTest()`` second. Both
    return values are discarded and this function returns ``None``.
    """
    bayes.testingNB()
    bayes.spamTest()
# -*- coding: utf-8 -*-
import bayes
from numpy import *

# Filter spam e-mail: run the experiment once; the return value is ignored.
bayes.spamTest()
print(flagLab) #get the feature-extracted documents and the class each document belongs to listOPosts,listClasses = bayes.loadDataSet() #build a list containing all words myVocabList = bayes.createVocabList(listOPosts) #list of word vectors trainMat=[] for postinDoc in listOPosts: trainMat.append( bayes.setOfWords2Vec(myVocabList,postinDoc) ) #get the probability vectors of the two classes and the probability of an abusive document p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses) print(pAb) print(p0V) print(p1V) ''' #decide whether a website post is abusive bayes.testingNB() #filter spam e-mail errRaiosum=0.0 for i in range(10): errRaiosum += bayes.spamTest() print('平均错误率为:' +str( errRaiosum/10))
# NOTE(review): the line breaks of this excerpt were lost. Everything up to the
# closing ''' is presumably a triple-quoted (disabled) walkthrough whose opening
# quotes lie above this excerpt -- confirm before re-indenting.
# The statements after the ''' call bayes.testingNB(), then add up the return
# value of bayes.spamTest() over range(10) and print that sum divided by 10
# after the '平均错误率为:' label.
import bayes

# Run the spam-filter experiment once and print whatever spamTest() returns.
# print is called with parentheses so the script parses on Python 3 as well;
# with a single argument the output on Python 2 is unchanged.
print(bayes.spamTest())
# --- Disabled walkthrough (word vectors, training, two sample classifications) ---
# myVocabList = bayes.createVocabList(listOPosts)
# print myVocabList
#
# print bayes.setOfWords2Vect(myVocabList, listOPosts[0])
# print bayes.setOfWords2Vect(myVocabList, listOPosts[3])
#
# trainMat = map(lambda postinDoc: bayes.setOfWords2Vect(myVocabList, postinDoc), listOPosts)
# print
# for item in trainMat:
#     print item
#
# p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
#
# print p0V
# print p1V
# print pAb
#
# testEntry = ['love', 'my', 'dalmation']
# thisDoc = np.array(bayes.setOfWords2Vect(myVocabList, testEntry))
# # print thisDoc
# print testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
# testEntry = ['stupid', 'garbage']
# thisDoc = np.array(bayes.setOfWords2Vect(myVocabList, testEntry))
# # print thisDoc
# print testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0V, p1V, pAb)

# Mean of the spamTest() return value over int(N) runs.
# N stays a float so the final division is a true division on Python 2 too.
N = 1.0
result = sum(bayes.spamTest() for _ in range(int(N)))
print(result / N)
# --- Disabled walkthrough (vocabulary, training, two sample classifications) ---
# myVocabList=bayes.createVocabList(listOPosts)
# print(myVocabList)
# # print(len(myVocabList))
# # print(bayes.setOfWords2Vec(myVocabList,listOPosts[0]))
# # print(bayes.setOfWords2Vec(myVocabList,['time']))
#
# trainMat=[]
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))
# # print(trainMat)
# p0V,p1V,pAb=bayes.trainNB0(trainMat,listClasses)
# # print(p0C,p1V,pAb)
# testEntry=['love','my','dalmation']
# thisDoc=np.array(bayes.setOfWords2Vec(myVocabList,testEntry))
# print(testEntry,'classified as:',bayes.classifyNB(thisDoc,p0V,p1V,pAb))
# testEntry=['stupid','garbage']
# thisDoc=np.array(bayes.setOfWords2Vec(myVocabList,testEntry))
# print(testEntry,'classified as:',bayes.classifyNB(thisDoc,p0V,p1V,pAb))

## ---------------- Filtering spam e-mail with naive Bayes ----------------
# Directory handed to spamTest(); it is an absolute path on the author's machine.
filepath = '/Users/songhaiyue/Desktop/B01_python/Machine_Learning_in_Action/machinelearninginaction/Ch04/'

# Disabled tokenisation probe on a single message:
# filename=filepath+'email/ham/6.txt'
# emailText=open(filename,encoding ='unicode_escape').read()
# # print(emailText)
# listOfTokens=re.split('\s',emailText)
# print(listOfTokens)

bayes.spamTest(filepath)
import bayes

# reload() is a builtin on Python 2 only; on Python 3 it lives in importlib.
try:
    reload
except NameError:
    from importlib import reload

# Re-execute bayes so edits made since it was first imported (e.g. in an
# interactive session) are picked up before the experiment runs.
reload(bayes)

# The original loop ran over range(1, 20) -- 19 iterations -- but divided the
# sum by 100, so the printed figure was not the mean of the runs. The run count
# is kept at 19 and the same number is used as the divisor.
NUM_RUNS = 19

total = 0.0
for _ in range(NUM_RUNS):
    total += bayes.spamTest()

# Mean of the spamTest() return values over NUM_RUNS runs.
print(total / NUM_RUNS)
# NOTE(review): this excerpt has lost its line breaks and indentation. It opens
# partway through a train/evaluate routine (the enclosing def is not visible) and
# stops partway through calcMostFreq, so it is left untouched; points to confirm:
#   * the test loop calls `sefOfWord2Vec` while the training loop calls
#     `setOfWord2Vec` -- looks like a typo that would raise NameError;
#   * `sorted(..., key = ..., reveser)` puts a positional argument after a
#     keyword argument (a SyntaxError); presumably `reverse=True` was meant;
#   * `dict.iteritems()` exists only on Python 2;
#   * the feed host is spelled `craiglist` -- presumably `craigslist`; verify.
for docIndex in trainingSet: trainMat.append(setOfWord2Vec(vocabList,docList[docIndex])) trainClasses.append(classList[docIndex]) p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) errorCount = 0 #3. (next four lines) classify the test set for docIndex in testSet: wordVector = sefOfWord2Vec(vocabList,docList[docIndex]) if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]: errorCount += 1 print('the error rate is: ',float(errorCount)/len(testSet)) #test print(bayes.spamTest()) print(bayes.spamTest()) import feedparser ny = feedparser.parse('http://newyork.craiglist.org/stp/index.rss') ny['entries'] len(ny['entries']) #RSS feed classifier and high-frequency word removal function def calcMostFreq(vocabList,fullText): import operator freqDict = {} for token in vocabList: freqDict[token] = fullText.count(token) sortedFreq = sorted(freqDict.iteritems(),key = operator.itemgetter(1),reveser)
import bayes
from numpy import *
import re

listOPosts, listClasses = bayes.loadDataSet()

# --- 4.5 (disabled): vocabulary, word vectors, training, testingNB ---
# myVocabList = bayes.createVocabList(listOPosts)
# # print(myVocabList)
# word2vec = bayes.setOfWords2Vec(myVocabList,listOPosts[0])
# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))
# p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses)
# print(pAb)
# print(p0V)
# print(p1V)
# bayes.testingNB()

# --- 4.6 (disabled): tokenising a sentence with a regular expression ---
# mySent = ''
# mySent.split()
# regEx = re.compile('\\W*')
# listOfTokens = regEx.split(mySent)

# Add up the spamTest() return value over 100 runs, then print the mean.
error_rate = sum(bayes.spamTest() for _ in range(100))
print("average error rate is:", float(error_rate / 100))