def testingNB(): listOPosts,listClasses = bayes.loadDataSet() myVocabList = bayes.createVocabList(listOPosts) trainMat=[] for postinDoc in listOPosts: trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc)) p0V,p1V,pAb = bayes.trainNB0(array(trainMat),array(listClasses)) testEntry = ['love', 'my', 'dalmation'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ',bayes.classifyNB(thisDoc,p0V,p1V,pAb) testEntry = ['stupid', 'garbage'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ',bayes.classifyNB(thisDoc,p0V,p1V,pAb)
def testingNB(): listOPosts, listClasses = bayes.loadDataSet() myVocabList = bayes.createVocabList(listOPosts) trainMat = [] for postinDoc in listOPosts: trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc)) p0V,p1V,pAb = bayes.trainNB0(trainMat, listClasses) testEntry = ['love', 'my', 'dalmation', 'stupid'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb) testEntry = ['quit', 'stupid'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def testingNB(): postList, classList = bayes.loadDataSet() myVocabList = bayes.createVocabList(postList) trainMat = [] for post in postList: trainMat.append(bayes.setOfWords2Vec(myVocabList, post)) p0V, p1V, pAb = bayes.trainNB0(trainMat, classList) testEntry = ['love', 'my', 'dalmation'] thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry) print testEntry, 'classified as: ', bayes.classifyNB( thisDoc, p0V, p1V, pAb) testEntry = ['stupid', 'garbage'] thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry) print testEntry, 'classified as: ', bayes.classifyNB( thisDoc, p0V, p1V, pAb)
def spamTest(): docList = [] classList = [] fullText = [] for i in range(1, 26): wordList = textParse(open('xxx/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = textParse(open('xxx/%d,txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 for docIndex in testSet: wordVector = bayes.setOfWord2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet)
def spamTest():
    """Hold-out test of the naive-Bayes spam filter (Python 3 variant).

    Fixes:
      - trainNBO (letter O) -> trainNB0 (zero), matching the name used by
        every other block in this file
      - trainingSet = range(50) -> list(range(50)): Python 3 range objects
        do not support item deletion, so del trainingSet[i] would raise
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Spam mails are class 1.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Ham mails are class 0.
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # list() so we can delete chosen indices below.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(trainingMat, trainingClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print ("the error rate is: ", float(errorCount)/len(testSet))
def spamTest(): docList = []; classList=[]; fullText=[] for i in range(1, 26): wordList = textParse(open('./email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = textParse(open('./email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocalList = bayes.createVocabList(docList) trainingSet = range(50); testSet=[] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) trainMat=[];trainClass=[]; for docIndex in trainingSet: trainMat.append(bayes.addWordInList(vocalList, docList[docIndex])) trainClass.append(classList[docIndex]) p0V,p1V,ps = bayes.trainNB0(array(trainMat), array(trainClass)) errorCount = 0 for doc in testSet: wordV = bayes.addWordInList(vocalList, docList[doc]) if bayes.classifyNB(array(wordV), p0V, p1V, ps) != classList[doc]: errorCount+=1 print 'err count is %d' % errorCount
def spamTest():
    """Hold-out test of the NB spam filter (Python 3 variant).

    Fix: bayes.trainNBO (letter O) -> bayes.trainNB0 (zero); every other
    block in this file calls trainNB0.
    """
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # range objects don't support del in Python 3, so materialize a list.
    trainingSet = list(range(50)); testSet = []
    # Randomly pick 10 of the 50 documents as the test set.
    for i in range(10):
        # Random integer index in [0, len(trainingSet)).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove so it can't be picked again
    # The remaining 40 documents form the training set.
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # Convert the held-out document to a word vector and classify it.
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
def spamTest():
    """Hold-out test of the NB spam filter; returns the error rate.

    Fix: the spam-file open() lacked the errors='ignore' that the ham
    open() already used, so a single non-UTF-8 byte in a spam mail would
    raise UnicodeDecodeError. Both opens now ignore decode errors.

    :return: fraction of the 10 held-out mails that were misclassified
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Spam mails are class 1; ignore undecodable bytes like the ham path does.
        wordList = bayes.textParse(open('email/spam/%d.txt' % i, errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Ham mails are class 0.
        wordList = bayes.textParse(open('email/ham/%d.txt' % i, errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    dataSet = list(range(50))
    testSet = []
    # Reserve 10 random document indices for testing.
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(dataSet)))
        testSet.append(dataSet[randIndex])
        del(dataSet[randIndex])
    trainMat = []
    trainClasses = []
    for doc in dataSet:
        trainMat.append(bayes.bagOfWord2Vec(vocabList, docList[doc]))
        trainClasses.append(classList[doc])
    # NOTE(review): this trainNB0 is unpacked as (pAbusive, p1, p0) — the
    # reverse of the (p0, p1, pSpam) order used elsewhere in this file;
    # classifyNB below is called consistently with that order, but confirm
    # against the bayes module.
    pAbusive, p1Vect, p0Vect = bayes.trainNB0(trainMat, trainClasses)
    errorCount = 0.0
    for i in testSet:
        vec2Classify = bayes.bagOfWord2Vec(vocabList, docList[i])
        if bayes.classifyNB(vec2Classify, p1Vect, p0Vect, pAbusive) != classList[i]:
            errorCount += 1
    # 10 == len(testSet), the number of held-out mails.
    return errorCount/10
def spamTest():
    # Hold-out test of the NB spam filter: parse 25 spam + 25 ham mails,
    # train on a random 40, and print the number of mistakes on the 10
    # held-out mails. (Python 2: print statement, del on a range-list.)
    docList = []     # one parsed word list per mail
    classList = []   # 1 = spam, 0 = ham, parallel to docList
    fullText = []    # all words of all mails, flattened (unused below)
    for i in range(1, 26):
        # Spam mails -> class 1.
        wordList = textParse(open('./email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Ham mails -> class 0.
        wordList = textParse(open('./email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocalList = bayes.createVocabList(docList)
    # Randomly move 10 of the 50 indices into the test set.
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        # addWordInList converts a mail to a word vector over the vocabulary.
        trainMat.append(bayes.addWordInList(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, ps = bayes.trainNB0(array(trainMat), array(trainClass))
    # Count misclassifications on the hold-out set.
    errorCount = 0
    for doc in testSet:
        wordV = bayes.addWordInList(vocalList, docList[doc])
        if bayes.classifyNB(array(wordV), p0V, p1V, ps) != classList[doc]:
            errorCount += 1
    print 'err count is %d' % errorCount
def spamTest(): docList = [] classList = [] fullText = [] for i in range(1, 26): # 导入并解析文本 wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50) testSet = [] # 随机构建训练集 for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 # 对测试集分类 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is:', float(errorCount) / len(testSet)
def testNB():
    """Train a NB model on the toy posts and print a human-readable label
    ('Garbage' / 'Not garbage') for two sample entries."""
    posts, labels = bayes.loadDataSet()  # load the sample data
    vocab = bayes.createVocabList(posts)
    trainMat = [bayes.setOfWords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = bayes.trainNB0(trainMat, labels)
    # Map the numeric class back to a readable label.
    resultLabel = {0: 'Not garbage', 1: 'Garbage'}
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(bayes.setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as:',
              resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
def localWords(feed1, feed0):
    # Train a NB classifier to tell apart posts from two parsed RSS feeds
    # (feed1 = class 1, feed0 = class 0); prints the hold-out error rate and
    # returns (vocabList, p0V, p1V) for later inspection of indicative words.
    import feedparser
    docList = []     # one word list per post
    classList = []   # 1 for feed1 posts, 0 for feed0 posts
    fullText = []    # all words flattened, for frequency counting
    # Use the same number of posts from both feeds to keep classes balanced.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # print('minlen=%d'%minLen)
    for i in range(minLen):
        # visit one rss source every time
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # visit rss 0
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Remove the 30 most frequent words — they are mostly stop words that
    # carry little class information.
    vocabList = bayes.createVocabList(docList)
    top30words = calcMostFreq(vocabList, fullText)
    for pairW in top30words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Random hold-out split: 20 posts for testing, the rest for training.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del (trainingSet[randIdx])
    trainMat = []
    trainClasses = []
    for docIdx in trainingSet:
        # print("doc idx:%d, len=%d" %( docIdx, len(docList)))
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIdx]))
        trainClasses.append(classList[docIdx])
    p0V, p1V, pSpam = bayes.trainNB0(numpy.array(trainMat), numpy.array(trainClasses))
    errorCount = 0
    for docIdx in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIdx])
        if bayes.classifyNB(numpy.array(wordVector), p0V, p1V, pSpam) != classList[docIdx]:
            errorCount += 1
    print('the error rate is: %.2f' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V
def main():
    # Demo driver: build a vocabulary from the sample posts, show one post as
    # a word vector, train NB, and classify two test phrases.
    # (Runtime strings are intentionally left in Chinese.)
    print '开始测试...'
    listOPosts, listClasses = loadDataSet()
    myVocabList = docTool.createVocabList(listOPosts)
    print '词汇表:\n', myVocabList
    # Show how the first post maps onto the vocabulary (1 = present, 0 = absent).
    wordsVec = docTool.setOfWords2Vec(myVocabList, listOPosts[0])
    print '将第一句话转换成向量,存在的单词为1,不存在的单词为0\n', wordsVec
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(docTool.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    # Turn testEntry into a feature vector, i.e. one sample to classify.
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))
    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    print testEntry, '分类是:',label
    testEntry = ['stupid', 'garbage']
    # Turn testEntry into a feature vector, i.e. one sample to classify.
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))
    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    print testEntry, '分类是:',label
    print '测试结束...'
def main():
    # Demo driver (duplicate variant of the block above, differing only in
    # print spacing): build the vocabulary, show one post as a vector, train
    # NB, and classify two test phrases. Runtime strings stay in Chinese.
    print '开始测试...'
    listOPosts, listClasses = loadDataSet()
    myVocabList = docTool.createVocabList(listOPosts)
    print '词汇表:\n', myVocabList
    # First post as a presence/absence vector over the vocabulary.
    wordsVec = docTool.setOfWords2Vec(myVocabList, listOPosts[0])
    print '将第一句话转换成向量,存在的单词为1,不存在的单词为0\n', wordsVec
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(docTool.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    # Convert testEntry into one feature-vector sample.
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))
    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    print testEntry, '分类是:', label
    testEntry = ['stupid', 'garbage']
    # Convert testEntry into one feature-vector sample.
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))
    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    print testEntry, '分类是:', label
    print '测试结束...'
def localWords(feed1, feed0):
    # Classify posts from two parsed RSS feeds (feed1 = class 1, feed0 = class 0)
    # with naive Bayes; prints the hold-out error rate and returns
    # (vocabList, p0V, p1V). Python 2 variant.
    docList = []
    classList = []
    fullText = []
    # Balance the classes by using the shorter feed's post count.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = st.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = st.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    #remove top frequent words (mostly stop words with no class signal)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    #build training and testing set: 20 random posts held out for testing
    trainingSet = range(2*minLen)
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # Bag-of-words (multinomial) vector: counts, not just presence.
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    #training
    p0V, p1V, pLocal = bayes.trainNB0(array(trainMat), array(trainClasses))
    #testing
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pLocal) != classList[docIndex]:
            errorCount+=1
    print 'the error rate is: ', float(errorCount)/len(testSet)
    return vocabList, p0V, p1V
def spamTest():
    # Hold-out test of the NB spam filter; returns the error rate on 20
    # randomly chosen test mails. Python 2 variant (xrange, print statement).
    # Runtime strings are intentionally left in Chinese.
    emailList = []  # one parsed word list per mail
    classes = []    # 1 = spam, 0 = ham
    # Load the data.
    for i in xrange(1, 26):
        # Read a spam mail.
        email = textParse(open('../data/email/spam/%d.txt' % i).read())
        emailList.append(email)
        classes.append(1)
        # Read a ham mail.
        email = textParse(open('../data/email/ham/%d.txt' % i).read())
        emailList.append(email)
        classes.append(0)
    # Build the vocabulary.
    vocabularyList = bayes.vocabularyList(emailList)
    # Build the word-frequency matrix for all mails.
    dataSet = bayes.wordsMatrix(vocabularyList, emailList)
    # Split into training and test sets. NOTE: unlike the other variants in
    # this file, the test SAMPLES (not indices) are removed from dataSet.
    test_num = 20  # size of the test set
    testingData = []
    testingClasses = []
    for i in xrange(test_num):
        testIndex = int(random.uniform(0, len(dataSet)))
        testingData.append(dataSet[testIndex])
        testingClasses.append(classes[testIndex])
        # Remove the chosen sample from the training data.
        del (dataSet[testIndex])
        del (classes[testIndex])
    # Train the model on the remaining samples.
    p0, p1, pc1 = bayes.trainModel(dataSet, classes)
    # Measure the test error.
    errorCount = 0
    i = 0
    for testSample in testingData:
        result = bayes.classifyNB(testSample, p0, p1, pc1)
        if result != testingClasses[i]:
            errorCount += 1
            print "分类错误"
        i += 1
    errorRate = float(errorCount) / test_num
    print "错误率 %f" % errorRate
    return errorRate
def localWord(feed1, feed0): docList = [] classList = [] fullText = [] minLength = min(len(feed1['entries']), len(feed0['entries'])) for i in range(minLength): wordList = bayes.textParse(feed1['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = bayes.textParse(feed0['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) top30words = calMostFeq(vocabList, fullText) for pairW in top30words: if (pairW[0] in vocabList): vocabList.remove(pairW[0]) trainingSet = range(2 * minLength) testSet = [] for i in range(20): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMatrix = [] trainClass = [] for docIndex in trainingSet: trainMatrix.append(bayes.bagOfWordsToVec(vocabList, docList[docIndex])) trainClass.append(classList[docIndex]) p0V, p1V, pSapm = bayes.trainNB0(trainMatrix, trainClass) errorCount = 0 for docIndex in testSet: calculatedClass = bayes.classifyNB( bayes.bagOfWordsToVec(vocabList, docList[docIndex]), p0V, p1V, pSapm) if (calculatedClass != classList[docIndex]): errorCount += 1 print "error rate is: ", float(errorCount) / len(testSet) return vocabList, p0V, p1V
def spamTest():
    '''
    Automated test of the naive-Bayes spam classifier.
    Classifies each mail in a random 10-mail test set; counts one error per
    misclassified mail and prints the overall error rate plus debug output.
    '''
    docList = []    # one parsed word list per mail
    classList = []  # 1 = spam, 0 = ham
    fullText = []   # flattened words (note: only extended for ham below)
    for i in range(1, 26):
        # Parse a spam mail and label it class 1.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
        # Parse a ham mail and label it class 0.
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Build the vocabulary.
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Randomly pick 10 mails for testing.
    for i in range(10):
        # random.uniform(x, y) returns a random float in [x, y).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    errorDoc = []  # keep the misclassified documents for inspection
    for docIndex in testSet:
        wordVector = bayes.setWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            errorDoc.append(docList[docIndex])
    print(vocabList)
    print(trainMat)
    print('the errorCount is: ', errorCount)
    print('the testSet length is: ', len(testSet))
    print('the error rate is; ', float(errorCount) / len(testSet))
    print(errorDoc)
def localWords(feed1, feed0):
    # Train NB to distinguish posts of two RSS feeds (feed1 = class 1,
    # feed0 = class 0), print the 20-post hold-out error rate, and return
    # (vocabList, p0V, p1V). Python 3 variant.
    import feedparser
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Visit one entry of each RSS source per iteration.
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    # Remove the highest-frequency words (mostly stop words).
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # list() so items can be deleted in Python 3.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def spamTest(): docList = [] classList = [] fullText = [] # read the mail for i in range(1,26): wordlist1 = textParse(open('./email/spam/%d.txt' %i).read()) docList.append(wordlist1) fullText.extend(docList) classList.append(1) wordlist0 = textParse(open('./email/ham/%d.txt' %i).read()) docList.append(wordlist0) fullText.extend(docList) classList.append(0) # get the dictionary vablist = bayes.createVocablist(docList) # Random Test dateset trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] # Get the train dateset for docIndex in trainingSet: trainMat.append(bayes.setOfwords2Vec(vablist, docList[docIndex])) trainClasses.append(classList[docIndex]) pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses) # test the bayes errorCount = 0 for docIndex in testSet: testVec = bayes.setOfwords2Vec(vablist, docList[docIndex]) result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa) if result != classList[docIndex]: errorCount += 1 errorrate = float(errorCount) / len(testSet) print "the filter spam mail error rate is %f" %errorrate
def spamTest(): docList = [] classList = [] fullText = [] # read the mail for i in range(1, 26): wordlist1 = textParse(open('./email/spam/%d.txt' % i).read()) docList.append(wordlist1) fullText.extend(docList) classList.append(1) wordlist0 = textParse(open('./email/ham/%d.txt' % i).read()) docList.append(wordlist0) fullText.extend(docList) classList.append(0) # get the dictionary vablist = bayes.createVocablist(docList) # Random Test dateset trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] # Get the train dateset for docIndex in trainingSet: trainMat.append(bayes.setOfwords2Vec(vablist, docList[docIndex])) trainClasses.append(classList[docIndex]) pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses) # test the bayes errorCount = 0 for docIndex in testSet: testVec = bayes.setOfwords2Vec(vablist, docList[docIndex]) result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa) if result != classList[docIndex]: errorCount += 1 errorrate = float(errorCount) / len(testSet) print "the filter spam mail error rate is %f" % errorrate
def spamTest():
    # Hold-out test of the NB spam filter: 25 spam + 25 ham mails, train on a
    # random 40, print the error rate on the 10 held out. Python 2 variant;
    # the Chinese runtime string is intentionally kept.
    docList = []    # one parsed word list per mail
    classList = []  # 1 = spam, 0 = ham
    fullText = []   # all words flattened (unused below)
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = docTool.createVocabList(docList)
    # Randomly reserve 10 of the 50 indices for testing.
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(docTool.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(trainMat, trainClasses)
    # Count misclassifications on the hold-out set.
    errorCount = 0
    for docIndex in testSet:
        wordVector = docTool.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print '错误率: ', float(errorCount) / len(testSet)
def localWords(feed1, feed0):
    # The two RSS files are each parsed by feedparser, giving two dicts.
    docList = []    # list of posts, each post split into words
    classList = []  # label list
    fullText = []   # all words of all posts in one flat list
    # 'entries' holds multiple posts; minLen records the smaller post count
    # so indexing never goes out of bounds.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])  # take the post text and split into words
        docList.append(wordList)  # ['12','34'].append(['56','78']) ==> [ ['12','34'], ['56','78'] ]
        fullText.extend(wordList)  # ['12','34'].extend(['56','78']) ==> ['12','34','56','78']
        classList.append(1)  # New York posts are labeled 1
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # San Francisco posts are labeled 0
    vocabList = bayes.createVocabList(docList)  # build the vocabulary
    # Find the 30 most frequent words in fullText and remove them from vocabList.
    top30Words = calcMostFreq(vocabList, fullText)
    for (word, count) in top30Words:
        if word in vocabList:
            vocabList.remove(word)
    trainingSet = range(2 * minLen); testSet = []  # build training and test sets
    for i in range(minLen / 10):  # randomly hold out 10% of the data as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # convert each training post into a word vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))  # train
    # Measure the classifier's accuracy on the held-out data.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
    return vocabList, p0V, p1V
def spamTest():
    """Parse the 25 mails in each of the spam and ham folders into word lists,
    hold out 10 random mails of the 50 as a test set, train on the rest
    (hold-out cross validation), and print/return the test error rate.

    Fix: the test loop vectorized with setOfWords2Vec while training used
    bagOfWords2VecMN — mismatched representations distort the measured error
    rate. Both now use bagOfWords2VecMN.

    :return: (vocabList, p0V, p1V)
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 mails for the test set
        # random.uniform(x, y) returns a random float in [x, y).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # iterate over every training document
        trainMat.append(bayes.bagOfWords2VecMN(vocabList,
                                               docList[docIndex]))  # build word vectors
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat),
                                     array(trainClasses))  # compute the classification probabilities
    errorCount = 0
    for docIndex in testSet:  # iterate over the test set
        # Use the SAME bag-of-words vectorizer as in training.
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def spamTest():
    # Hold-out test of the NB spam filter (Python 3 variant): parse 25 spam
    # and 25 ham mails, train on a random 40, print misclassified docs and
    # the error rate on the 10 held out.
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # assume we have 25 emails for normal email and spam
        # NOTE(review): 'ANSI' is not a registered Python codec name (it is a
        # Windows notion, usually cp1252) — codecs.open/open would raise
        # LookupError here; confirm the intended encoding.
        # NOTE(review): spam uses codecs.open while ham uses builtin open —
        # in Python 3 both accept encoding=, but the asymmetry looks accidental.
        wordList = textParse(
            codecs.open(baseURI + 'email/spam/%d.txt' % i,
                        encoding='ANSI').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(
            open(baseURI + 'email/ham/%d.txt' % i, encoding='ANSI').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.create_vocab_list(docList)  #create vocabulary
    trainingSet = list(range(50))
    testSet = []  #create test set
    # this will pop out 10 emails of traning set for testing algorithm randomly
    for i in range(10):
        # np.random.uniform(low, high) draw samples from a uniform distribution
        # The probability density function of the uniform distribution is p(x)=1/(high-low)
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # train the classifier (get probs) trainNB0
        trainMat.append(bagOfwords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  #classify the remaining items
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest(): docList=[]; classList = []; fullText =[] for i in range(1,26): str=open('email/spam/%d.txt' % i).read() print "str" print str wordList = bayes.textParse(str) print "wordlist" print wordList docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = bayes.textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) print "doclist" print len(docList) print docList print "fulllist" print len(fullText) print fullText print classList vocabList = bayes.createVocabList(docList)#create vocabulary trainingSet = range(50); testSet=[] #create test set for i in range(10): randIndex = int(random.uniform(0,len(trainingSet))) testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) trainMat=[]; trainClasses = [] for docIndex in trainingSet:#train the classifier (get probs) trainNB0 trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V,p1V,pSpam = bayes.trainNB0(array(trainMat),array(trainClasses)) errorCount = 0 for docIndex in testSet: #classify the remaining items wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]: errorCount += 1 print "classification error",docList[docIndex] print 'the error rate is: ',float(errorCount)/len(testSet)
def spamTest(): docList = [] classList = [] fullText = [] # parse text from email for i in range(1, 26): wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) #parse text wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) #build training set and test set trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) #select testset and remove the testset from all dataset testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) #classify and test precision errorCount = 0 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet)
def localWords(feed1, feed0): import feedparser docList = [] classList = [] fullText = [] minLen = min(len(feed1['entries']), len(feed0['entries'])) print feed1['entries'] print feed0['entries'] for i in range(minLen): wordList = bayes.textParse(feed1['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(1) #NY is class 1 wordList = bayes.textParse(feed0['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) #create vocabulary top30Words = calcMostFreq(vocabList, fullText) #remove top 30 words for pairW in top30Words: if pairW[0] in vocabList: vocabList.remove(pairW[0]) trainingSet = list(range(2 * minLen)) testSet = [] #create test set for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: #train the classifier (get probs) trainNB0 trainMat.append(bayes.bagOfWord2VecMN(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 for docIndex in testSet: #classify the remaining items wordVector = bayes.bagOfWord2VecMN(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print('the error rate is: ', float(errorCount) / len(testSet)) return vocabList, p0V, p1V
def spamTest(): """ 对贝叶斯垃圾邮件分类器进行自动化处理 :return: """ docList = [] classList = [] fullText = [] for i in range(1, 26): # 1. 导入并解析文本文件 wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.append(wordList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.append(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50) testSet = [] for i in range(10): # 2. 随机构建训练集 randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: # train the classifier (get probs) trainNB0 trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 # 3. 对测试集分类 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet)
def spamTest(): docList = [] classList = [] fullText = [] # parse text from email for i in range(1,26): wordList = textParse(open('email/spam/%d.txt' %i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) #parse text wordList = textParse(open('email/ham/%d.txt' %i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) #build training set and test set trainingSet = range(50); testSet =[] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) #select testset and remove the testset from all dataset testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) #classify and test precision errorCount = 0 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount)/len(testSet)
def spamTest(): docList = [] classList = [] fullText = [] for i in range(1, 26): wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50) testSet = [] # randomly split data set into 2 sets: test set, and training set for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) # random int 0~len testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) # split trainMat = []; trainClasses = []; for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print "error word: %s" % (docList[docIndex]) print "error rate is: %f", float(errorCount) / len(testSet)
def spamTest(): docList = [] classList =[] fullText = [] #导人文件夹spam与ham下的文本文件,并将它们解析为词列表 for i in range(1,26): wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50);#本例中共有50封电子邮件,其中的值从0到49 testSet = [] '''选择出的数字所对应的文档被添加到测试集, 同时也将其从训练集中剔除。 ''' for i in range(10):#10封电子邮件被随机选择为测试集。 randIndex = int(random.uniform(0,len(trainingSet))) testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) trainMatrix=[]; trainClasses = [] for docIndex in trainingSet: trainMatrix.append(bayes.setOfWords2Vec(vocabList,docList[docIndex])) trainClasses.append(classList[docIndex]) p0V,p1V,pSpam = bayes.trainNB(array(trainMatrix),array(trainClasses)) errorCount=0 for docIndex in testSet: #如果邮件分类错误,则错误数加1,最后给出总的错误百分比 wordVector = bayes.setOfWords2Vec(vocabList,docList[docIndex]) if bayes.classifyNB(array(wordVector),p0V,p1V,pSpam) !=classList[docIndex]: errorCount +=1 print 'the error rate is :',float(errorCount) /len(testSet)
def spamTest():
    """Spam-filter demo: trains a naive Bayes model on 40 random emails
    (25 spam + 25 ham, files named 1.txt..25.txt) and prints the error
    rate over the 10 held-out messages."""
    documents = []
    labels = []
    for fileNo in range(1, 26):
        # Parse each email into a word list; spam is labelled 1, ham 0.
        documents.append(textParse(open('email/spam/%d.txt' % fileNo).read()))
        labels.append(1)
        documents.append(textParse(open('email/ham/%d.txt' % fileNo).read()))
        labels.append(0)
    vocabList = bayes.createVocabList(documents)  # full vocabulary
    remaining = list(range(50))  # 50 documents, matching len(labels)
    holdOut = []
    for _ in range(10):
        # Move a random document index into the held-out test set.
        choice = int(random.uniform(0, len(remaining)))
        holdOut.append(remaining[choice])
        del remaining[choice]
    featureRows = []
    featureLabels = []
    for idx in remaining:  # build the training matrix and its labels
        featureRows.append(bayes.setOfWords2Vec(vocabList, documents[idx]))
        featureLabels.append(labels[idx])
    # Fit the model: word-probability vectors and the spam prior.
    p0V, p1V, pSpam = bayes.trainNB0(array(featureRows), array(featureLabels))
    wrong = 0
    for idx in holdOut:  # score the held-out documents
        row = bayes.setOfWords2Vec(vocabList, documents[idx])
        if bayes.classifyNB(array(row), p0V, p1V, pSpam) != labels[idx]:
            wrong += 1
    print('the error rate is: ', float(wrong) / len(holdOut))
def spamTest():
    """Train a naive Bayes spam filter and report the percentage of the
    10 randomly held-out emails that are misclassified."""
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())
        docList.append(wordList)
        # BUG FIX: extend, not append — keep fullText a flat word list
        # instead of a list of lists.
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # Index lists for the training set and the test set.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        # Pick 10 of the 50 emails at random as the test set; the rest train.
        # random.uniform samples from [low, high): low included, high excluded.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # BUG FIX: was bayes.trainNBO (letter O); the trainer is trainNB0 (zero),
    # so the original raised AttributeError.
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('wrong testSet:', docList[docIndex])
    print('wrong rate:%.2f%%' % (float(errorCount) / len(testSet) * 100))
__author__ = 'jack' import bayes listOPosts,listClasses = bayes.loadDataSet() myVocabList = bayes.createVocabList(listOPosts) print myVocabList #print bayes.setOfWords2Vec(myVocabList, listOPosts[0]) trainMat = [] for post in listOPosts: trainMat.append(bayes.setOfWords2Vec(myVocabList, post)) p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses)) print pAb print p0V print p1V testEntry = ['love', 'my', 'dalmation'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print thisDoc print testEntry, 'classsified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb) testEntry = ['stupid', 'garbage'] thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print thisDoc print testEntry, 'classsified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
#!/usr/bin/python #encoding:utf-8 import bayes from numpy import * postingList, classList = bayes.loadDataSet() myVocabList = bayes.createVocabList(postingList) trainMat = [] for a in postingList: trainMat.append(bayes.setOfWords(myVocabList, a)) pc, p0, p1 = bayes.trainNB0(trainMat, classList) test = ['stupid', 'garbage'] thisDoc = array(bayes.setOfWords(myVocabList, test)) print bayes.classifyNB(thisDoc, p0, p1, pc)
def spamTest(): emailList = [] classes = [] # 读入数据 for i in xrange(1,26): # 读取垃圾邮件 email = textParse(open('../data/email/spam/%d.txt' %i).read()) emailList.append(email) classes.append(1) # 读取正常邮件 email = textParse(open('../data/email/ham/%d.txt' %i).read()) emailList.append(email) classes.append(0) # 构建词向量 vocabularyList = bayes.vocabularyList(emailList) # 构建词频矩阵 dataSet = bayes.wordsMatrix(vocabularyList, emailList) # 建立训练集和测试集 test_num = 20 #测试集数量 testingData = [] testingClasses = [] for i in xrange(test_num): testIndex = int(random.uniform(0, len(dataSet))) testingData.append(dataSet[testIndex]) testingClasses.append(classes[testIndex]) # 在原有数据中删除 del(dataSet[testIndex]) del(classes[testIndex]) # 训练模型 p0, p1, pc1 = bayes.trainModel(dataSet, classes) # 计算测试误差 errorCount = 0 i = 0 for testSample in testingData: result = bayes.classifyNB(testSample, p0, p1, pc1) if result != testingClasses[i]: errorCount += 1 print "分类错误" i += 1 errorRate = float(errorCount) / test_num print "错误率 %f" %errorRate return errorRate
import DividingData
import bayes
from matplotlib import pyplot as plt
import numpy as np

# Generate the test data, training data and ground-truth label files for
# five experiment runs.
for i in range(5):
    rightCateFile='classifyRightCate'+str(i)+'.txt'
    DividingData.dataSeg(i,rightCateFile)

# Run the naive Bayes classifier over each run's test data, writing the
# predicted categories to a result file.
for i in range(5):
    trainDir='trainData'+str(i)
    testDir='testData'+str(i)
    resultCateFile='classifyResultCate'+str(i)+'.txt'
    bayes.classifyNB(trainDir,testDir,resultCateFile)

# Compute and record the error rate of each of the five runs by comparing
# the predicted category file against the ground-truth category file.
errorRateRecord=[]
for i in range(5):
    rightCateFile='classifyRightCate'+str(i)+'.txt'
    resultCateFile='classifyResultCate'+str(i)+'.txt'
    er=bayes.errorRate(rightCateFile,resultCateFile)
    errorRateRecord.append(er)

# Bar chart: visualise each run's error rate.
# NOTE(review): the plotting code appears to continue beyond this chunk —
# fig/ax1/data/width/x_bar are presumably used below; confirm before editing.
fig=plt.figure(1)
ax1=plt.subplot(111)
# Round each rate to 3 decimals for display.
data=np.array([float(format(v,'.3f')) for v in errorRateRecord])
width=0.5
x_bar=np.arange(5)
#!/usr/bin/python #encoding:utf-8 import bayes from numpy import * postingList , classList = bayes.loadDataSet() myVocabList = bayes.createVocabList(postingList) trainMat = [] for a in postingList: trainMat.append( bayes.setOfWords( myVocabList, a ) ) pc, p0, p1 = bayes.trainNB0(trainMat, classList) test = ['stupid', 'garbage'] thisDoc = array( bayes.setOfWords( myVocabList, test ) ) print bayes.classifyNB(thisDoc, p0, p1, pc)
if __name__ == '__main__': weibo_emotion, weibo_content, weibo_content_emotion = get_dataset( XML_TRAIN_PATH) bow, l1_p0_vect, l1_p1_vect, p_emotion, l2_p0_vect, l2_p1_vect, p_positive = \ process_training_xml(weibo_emotion, weibo_content, weibo_content_emotion) test_weibo_emotion, test_content, test_content_emotion = get_dataset( XML_TEST_PATH) test_mat, test_cat = convert_to_mat_and_cat(bow, test_content, test_content_emotion) # 进行两种情感的分类 l1_pre_cat = map( lambda each: bayes.classifyNB(each, l1_p0_vect, l1_p1_vect, p_emotion), test_mat) l1_pre_cat_cnt = adder() test_content = map( lambda weibo: map( lambda sentence: (sentence, l1_pre_cat[l1_pre_cat_cnt()]), weibo), test_content) l2_pre_cat = map( lambda each: bayes.classifyNB(each, l2_p0_vect, l2_p1_vect, p_positive ), test_mat) l2_pre_cat_cnt = adder() def _mark_emotion(each): if each[1] == 0: return each
return Vec, out if __name__ == '__main__': googDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\goods.txt' badDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\bad.txt' # 1 好评 0 差评 goodVec, goodList = DataHandle(googDataPath, 1) badVec, badList = DataHandle(badDataPath, 0) listClasses = goodVec + badVec listOPosts = goodList + badList print(listClasses) print(listOPosts) myVocabList = bayes.createVocabList(listOPosts) print(myVocabList) # 3. 计算单词是否出现并创建数据矩阵 trainMat = [] for postinDoc in listOPosts: trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc)) # 4. 训练数据 p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses)) # 5. 测试数据 while True: inputS = input(u'请输入您对本商品的评价:') testEntry = wordCut(inputS) thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry)) print('评价: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def local_words(feed1, feed0):
    """Train a naive Bayes classifier on two RSS feeds.

    feed1, feed0: parsed feedparser results; feed1 entries are class 1,
    feed0 entries class 0.
    Returns (vocab_list, p0_v, p1_v).
    """
    import feedparser
    doc_list = []    # tokenised documents
    class_list = []  # document labels
    full_text = []
    # Use the same number of entries from both feeds so classes stay balanced.
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        # Tokenise, stripping punctuation.
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    # Build the vocabulary.
    vocab_list = create_vocab_list(doc_list)
    # Find the 30 most frequent words and drop them: high-frequency words
    # carry little class information.
    top30_words = calc_most_freq(vocab_list, full_text)
    for pair_w in top30_words:
        if pair_w[0] in vocab_list:
            vocab_list.remove(pair_w[0])
    # *2 because documents come from both feeds.
    # BUG FIX: must be a list — deleting from a bare range() raises
    # TypeError in Python 3.
    training_set = list(range(2 * min_len))
    #print(training_set)
    # Build the test set: randomly move 20 documents out of the training set.
    test_set = []
    for i in range(20):
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del training_set[rand_index]
    train_mat = []
    train_classes = []
    # Build the final training matrix and label vector.
    for doc_index in training_set:
        # Bag-of-words vector for each document.
        train_mat.append(bag_of_word2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    # With equal data from both feeds, p_spam comes out at 0.5.
    p0_v, p1_v, p_spam = trainNB0(np.array(train_mat), np.array(train_classes))
    error_count = 0
    # Classify the held-out documents and count mistakes.
    for doc_index in test_set:
        word_vec = bag_of_word2vec(vocab_list, doc_list[doc_index])
        if classifyNB(np.array(word_vec), p0_v, p1_v, p_spam) != class_list[doc_index]:
            error_count += 1
    print('the error rate is: ', float(error_count) / len(test_set))
    return vocab_list, p0_v, p1_v