def localWords(feed1, feed0):
    """Train and evaluate a two-class text classifier on two parsed feeds.

    Entries are taken pairwise from both feeds (up to the length of the
    shorter one), the most frequent words reported by ``calcMostFreq`` are
    dropped from the vocabulary, a random hold-out set is classified with a
    model trained on the remaining documents, and the hold-out error rate is
    printed.

    Args:
        feed1: mapping with an ``'entries'`` sequence whose items each have a
            ``'summary'`` text; these documents are labelled class 1.
        feed0: same layout as ``feed1``; these documents are labelled class 0.

    Returns:
        A tuple ``(vocabList, pVectDict)``: the pruned vocabulary and the
        first value returned by ``bayes.train`` (presumably the per-class
        word statistics -- confirm against ``bayes.train``).
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Documents are interleaved: feed1's entry first, then feed0's.
        for feed, label in ((feed1, 1), (feed0, 0)):  # NY is class 1
            wordList = bayes.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list(...) so that items can be removed on Python 3 as well, where
    # range() is an immutable sequence.
    trainingSet = list(range(2 * minLen))
    testSet = []
    # Hold out 20 documents as before; when the feeds are too short for that,
    # hold out half of the documents instead of indexing past the end of an
    # exhausted list.
    numTest = 20 if len(trainingSet) >= 20 else len(trainingSet) // 2
    for _ in range(numTest):
        # uniform() may return its upper bound because of float rounding, so
        # clamp the index to the last valid position.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on everything that was not held out.
    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pVectDict, pCateDict = bayes.train(array(trainMat), trainClasses)

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = bayes.classify(array(wordVector), pVectDict, pCateDict)
        if predicted != classList[docIndex]:
            errorCount += 1

    # No held-out documents means no errors to report; avoid dividing by zero.
    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    # Single-argument print emits the same text on Python 2 and Python 3.
    print('%s %s' % ('the error rate is: ', errorRate))
    return vocabList, pVectDict
def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (class 0), holds out 10 of
    the 50 documents at random, trains on the other 40, then prints every
    misclassified document followed by the hold-out error rate.

    Returns:
        None. Results are only printed.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Documents are interleaved: spam i first, then ham i.
        for pattern, label in (('email/spam/%d.txt', 1),
                               ('email/ham/%d.txt', 0)):
            # Context manager so the file handle is closed right away.
            with open(pattern % i) as emailFile:
                wordList = bayes.textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # list(...) so that items can be removed on Python 3 as well, where
    # range() is an immutable sequence.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # uniform() may return its upper bound because of float rounding, so
        # clamp the index to the last valid position.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on everything that was not held out.
    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pVectDict, pCateDict = bayes.train(array(trainMat), trainClasses)

    # Classify the held-out documents, reporting each mistake as it is found.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = bayes.classify(array(wordVector), pVectDict, pCateDict)
        if predicted != classList[docIndex]:
            errorCount += 1
            # Single-argument print emits the same text on Python 2 and 3.
            print('%s %s' % ("classification error:", docList[docIndex]))

    print('%s %s' % ('the error rate is: ', float(errorCount) / len(testSet)))