import feedparser
import numpy as np

import bayes


def testingNB():
    # Train the classifier on the toy post data set, then classify
    # two hand-built test documents.
    listOfPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOfPosts)
    trainMatrix = []
    for post in listOfPosts:
        trainMatrix.append(bayes.setOfWordsToVector(myVocabList, post))
    p0, p1, pAbusive = bayes.trainNB(np.array(trainMatrix), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWordsToVector(myVocabList, testEntry))
    print(testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0, p1, pAbusive))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWordsToVector(myVocabList, testEntry))
    print(testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0, p1, pAbusive))
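
# classifyNB comes from the bayes module above. For reference, here is a
# minimal sketch of the decision rule it is assumed to implement: compare
# log-posterior scores, where p0Vec and p1Vec are the per-word log
# conditional probability vectors returned by trainNB. The name
# classifyNBSketch and this body are illustrative assumptions, not the
# module's actual source.
def classifyNBSketch(wordVector, p0Vec, p1Vec, pClass1):
    # Sum the log-probabilities of the words present in the document, add
    # each class's log prior, and predict the class with the higher score.
    p1 = np.sum(wordVector * p1Vec) + np.log(pClass1)
    p0 = np.sum(wordVector * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0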
def localWords(feed0, feed1):
    # Build labeled documents from the two parsed RSS feeds: entries from
    # feed1 are class 1, entries from feed0 are class 0.
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # Remove the 30 most frequent words; these are mostly stop words that
    # carry little class information.
    top30Words = calculateMostFrequentValues(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Hold out 20 randomly chosen documents as the test set
    # (assumes the feeds supply at least 20 entries in total).
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWordsToVetor(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0v, p1v, pSpam = bayes.trainNB(np.asarray(trainMat), np.asarray(trainClasses))
    # Classify the held-out documents and report the error rate.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWordsToVetor(vocabList, docList[docIndex])
        if bayes.classifyNB(np.asarray(wordVector), p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0v, p1v
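
# calculateMostFrequentValues is referenced above but not defined in this
# listing. A minimal sketch of what it is assumed to do: return the 30 most
# common tokens as (word, count) pairs, matching the pairW[0] access in
# localWords. The body is an illustrative assumption.
def calculateMostFrequentValues(vocabList, fullText):
    # Count how often each vocabulary word occurs across all documents,
    # then keep the 30 most frequent (word, count) pairs.
    from collections import Counter
    vocabSet = set(vocabList)
    counts = Counter(token for token in fullText if token in vocabSet)
    return counts.most_common(30)


# Example usage. The feed URLs below are placeholders, not real endpoints;
# any two RSS feeds whose entries carry a 'summary' field will work.
if __name__ == '__main__':
    testingNB()
    feed0 = feedparser.parse('https://example.com/feed0.rss')  # placeholder URL
    feed1 = feedparser.parse('https://example.com/feed1.rss')  # placeholder URL
    vocabList, p0v, p1v = localWords(feed0, feed1)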