# -*- coding: utf-8 -*-
# Module-level imports these methods rely on (defined once at the top of the file).
import random
from numpy import array


def crossValidClassifier(self):
    dataMat, labels = self.loadProcessedData()
    bayesian = Bayesian()
    myVocabList = bayesian.createVocabList(dataMat)
    trainingSet = range(51)
    testSet = []
    # create the test set: hold out 10 randomly chosen documents
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex]))
        trainClasses.append(labels[docIndex])
    p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))
    clabels = ['军事', '体育']  # class labels: military (0), sports (1)
    data = self.getData('data/bayesian/rss/rss_junshi.txt') + self.getData(
        'data/bayesian/rss/rss_tiyu.txt')
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex])
        predicted = bayesian.predict(array(wordVector), p0V, p1V, pSpam)
        if predicted != labels[docIndex]:
            errorCount += 1
            print "predicted class:", clabels[predicted]
            print "classification error", data[docIndex]
            print "---------------------------------------"
    print 'the error rate is: ', float(errorCount) / len(testSet)
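## The tests above assume a Bayesian class whose fit/predict pair follows the
## "Machine Learning in Action" trainNB0/classifyNB recipe: Laplace-smoothed
## word counts and a comparison of log posteriors. The real class lives
## elsewhere in this project; the sketch below (class and method bodies are
## assumptions, not the project's actual implementation) only illustrates the
## expected behaviour and return values (p0V, p1V, prior of class 1).
from numpy import ones, log


class BayesianSketch(object):
    def fit(self, trainMatrix, trainCategory):
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])
        pClass1 = sum(trainCategory) / float(numTrainDocs)  # prior P(class = 1)
        p0Num = ones(numWords)   # Laplace smoothing: start counts at 1
        p1Num = ones(numWords)
        p0Denom = 2.0
        p1Denom = 2.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        # log conditional word probabilities per class, plus the class-1 prior
        return log(p0Num / p0Denom), log(p1Num / p1Denom), pClass1

    def predict(self, vec2Classify, p0Vec, p1Vec, pClass1):
        # compare log posteriors: sum of log likelihoods plus log prior
        p1 = sum(vec2Classify * p1Vec) + log(pClass1)
        p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
        return 1 if p1 > p0 else 0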
def SingleClassifier(self):
    ## Load the RSS feeds and save them as text files.
    ## Do not run this block unless you are generating new data.
    #juns_count = rss.loadRSS('http://mil.sohu.com/rss/junshi.xml','data/bayesian/rss/rss_junshi.txt')
    #tiyu_count = rss.loadRSS('http://rss.news.sohu.com/rss/sports.xml','data/bayesian/rss/rss_tiyu.txt' )
    #print juns_count
    #print tiyu_count
    dataMat, labels = self.loadProcessedData()
    bayesian = Bayesian()
    myVocabList = bayesian.createVocabList(dataMat)
    ## build the word-vector matrix for the training documents
    trainMat = []
    for postinDoc in dataMat:
        trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
    ## estimate the class priors and conditional word probabilities from the existing data
    p0V, p1V, pAb = bayesian.fit(array(trainMat), array(labels))
    ## classify a new piece of text against the trained model
    testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
    testEntry = self.testEntryProcess(testText)
    thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
    clabels = ['军事', '体育']  # class labels: military (0), sports (1)
    print testText, 'classified as: ', clabels[bayesian.predict(thisDoc, p0V, p1V, pAb)]
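## Both classifiers above vectorise documents through createVocabList and
## setOfWords2Vec. A minimal sketch of what those two helpers are assumed to
## do (the project's real versions are defined on the Bayesian class): build
## the vocabulary as the union of all tokens, then mark each document with a
## 0/1 presence vector (set-of-words, i.e. presence rather than counts).
def createVocabListSketch(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of all tokens seen so far
    return list(vocabSet)


def setOfWords2VecSketch(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # presence flag, not a count
    return returnVec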
def testingNB(self):
    ## load the existing toy data set
    listOPosts, listClasses = self.loadDataSet()
    bayesian = Bayesian()
    myVocabList = bayesian.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
    ## estimate the class priors and conditional word probabilities from the data set
    p0V, p1V, pAb = bayesian.fit(array(trainMat), array(listClasses))
    ## check the predicted class for two test strings
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
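## testingNB assumes loadDataSet returns a handful of tokenised posts together
## with 0/1 labels (1 = abusive). The stand-in below only illustrates that
## expected shape; it is hypothetical example data, not the project's real
## loadDataSet.
def loadDataSetSketch():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['stop', 'posting', 'stupid', 'garbage'],
                   ['love', 'my', 'dalmation', 'so', 'cute'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec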
def spamTest(self, bayesian):
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # spam examples are labelled 1, ham examples 0
        wordList = self.textParse(
            open('data/bayesian/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = self.textParse(
            open('data/bayesian/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # note: the passed-in instance is not used; a fresh Bayesian is created here
    bayesian = Bayesian()
    vocabList = bayesian.createVocabList(docList)  # create vocabulary
    trainingSet = range(50)
    testSet = []
    # create the test set: hold out 10 randomly chosen emails
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bayesian.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bayesian.setOfWords2Vec(vocabList, docList[docIndex])
        if bayesian.predict(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is: ', float(errorCount) / len(testSet)
    #return vocabList,fullText
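## spamTest feeds each raw email through self.textParse before vectorising it.
## A minimal sketch of such a tokenizer, assuming the usual approach of
## splitting on non-alphanumeric characters, lower-casing, and dropping very
## short tokens; the project's real textParse may differ.
import re


def textParseSketch(bigString):
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]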