def spamTest():
    """Train a naive-Bayes spam classifier on the ham/spam email corpus and
    print predictions for 10 randomly held-out test emails.

    Relies on module-level helpers visible elsewhere in this project:
    loadDataSet, Bayes.createVocabList, Bayes.bagOfWords2Vec,
    Bayes.trainNB0, Bayes.classifyNB, and the random module.
    """
    # Raw strings: these Windows paths contain backslash sequences such as
    # \m, \C, \e, \h, \s that are invalid escapes in a normal string literal
    # (SyntaxWarning today, a syntax error in future Python versions).
    hamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\ham")
    spamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\spam")
    # Parallel lists: Allemail[i] is labelled by AllList[i] (0 = ham, 1 = spam).
    Allemail = hamemail + spamemail
    AllList = [0] * len(hamemail) + [1] * len(spamemail)

    VocalbList = Bayes.createVocabList(Allemail)

    # Hold out 10 random emails as the test set.  Delete the email and its
    # label at the same index so the two lists stay parallel.
    testMat = []
    realclass = []
    for _ in range(10):
        randIndex = random.randrange(len(Allemail))
        testMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[randIndex]))
        realclass.append(AllList[randIndex])
        del Allemail[randIndex]
        del AllList[randIndex]

    # Everything left over is the training set.
    trainMat = [Bayes.bagOfWords2Vec(VocalbList, doc) for doc in Allemail]
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, AllList)

    for wordVec, real in zip(testMat, realclass):
        print("test_result=", Bayes.classifyNB(wordVec, p0vect, p1vect, pA),
              ",real_result=", real)
def localWords(feed1, feed0):
    """Train and evaluate a naive-Bayes classifier on two RSS feeds.

    Entries from feed1 are labelled 1 and entries from feed0 are labelled 0.
    The 30 most frequent words are pruned from the vocabulary (a crude
    stop-word list), 20 random documents are held out for testing, and the
    hold-out error rate is printed.

    Returns:
        (vocabList, p0V, p1V): the pruned vocabulary and the per-class word
        probability vectors produced by Bayes.trainNB0.
    """
    docList = []      # one tokenised word list per document
    classList = []    # parallel class labels (1 = feed1, 0 = feed0)
    fullText = []     # all words from all documents, flattened
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Parse each entry's summary into lowercase words of length > 2.
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = Bayes.createVocabList(docList)
    # Natural language is dominated by a few very frequent, low-information
    # words; removing the top 30 works like a small stop-word list.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list(...) is required: we delete elements below, which Python 3 range
    # objects do not support (the original Python 2 range returned a list).
    trainingSet = list(range(2 * minLen))
    print('minLen : %d' % minLen)
    if minLen < 20:
        print('the len is too small.')

    # Randomly hold out 20 document ids for testing; remove them from the
    # training ids so train and test sets are disjoint.
    testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Learn per-class word frequencies and the class prior.
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def localWord(feed0, feed1):
    """Variant of localWords: train a naive-Bayes classifier on two RSS feeds.

    feed1 entries are labelled 1, feed0 entries 0.  Prunes the 30 most
    frequent words, holds out 20 random documents, trains on the rest, and
    counts hold-out errors (the error-rate report is left disabled, as in
    the original).

    Returns:
        (vocabList, p0V, p1V): pruned vocabulary and per-class word
        probability vectors from Bayes.trainNB0.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    listOfPost = []   # one word list per document
    classVec = []     # parallel labels (1 = feed1, 0 = feed0)
    fullText = []     # flattened word stream for frequency counting
    for i in range(minLen):
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])  # feed1 first
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(1)
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(0)

    # Spelling aligned with the rest of this file, which calls
    # Bayes.createVocabList / calcMostFreq (original: creatVocabList /
    # calMostFreq — names that do not exist in the Bayes module used here).
    vocabList = Bayes.createVocabList(listOfPost)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        # calcMostFreq yields (word, count) pairs; test the word itself,
        # not the pair (the pair is never a vocabList member).
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list() so elements can be deleted below (range objects are immutable).
    trainingSet = list(range(2 * minLen))
    dataSet = []  # held-out document ids used as the test set
    for _ in range(20):
        # BUGFIX: np.random.uniform(n) means uniform(low=n, high=1.0), and
        # the original stored the raw position instead of the document id.
        # Draw from [0, len(trainingSet)) and record trainingSet[randIndex].
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        dataSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex]))
        trainClass.append(classVec[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(np.array(trainMat), trainClass)

    errorCount = 0.0
    for docIndex in dataSet:
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
        if Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam) != classVec[docIndex]:
            errorCount += 1
    # print("the error rate is :", errorCount / float(len(dataSet)))
    return vocabList, p0V, p1V
def testingNB():
    """Smoke-test the naive-Bayes classifier on the toy post data set.

    Trains on every post returned by loadDataSet() and prints the predicted
    class for two hand-built sentences (one friendly, one abusive).
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = Bayes.createVocabList(listOPosts)
    # Bag-of-words (word counts) rather than set-of-words (presence only),
    # as the original's commented-out setOfWords2Vec call indicates.
    trainMat = [Bayes.bagOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses)

    # print() calls replace the Python 2 print statements, consistent with
    # spamTest in this file; the two duplicated test cases are folded into
    # one loop.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ',
              Bayes.classifyNB(thisDoc, p0V, p1V, pAb))