import random
import numpy
import bayes

def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # print('minlen=%d' % minLen)
    for i in range(minLen):
        # visit one RSS source each iteration
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # visit RSS source 0
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # remove the most frequent words
    vocabList = bayes.createVocabList(docList)
    top30words = calcMostFreq(vocabList, fullText)
    for pairW in top30words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del trainingSet[randIdx]
    trainMat = []
    trainClasses = []
    for docIdx in trainingSet:
        # print("doc idx:%d, len=%d" % (docIdx, len(docList)))
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIdx]))
        trainClasses.append(classList[docIdx])
    p0V, p1V, pSpam = bayes.trainNB0(numpy.array(trainMat), numpy.array(trainClasses))
    errorCount = 0
    for docIdx in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIdx])
        if bayes.classifyNB(numpy.array(wordVector), p0V, p1V, pSpam) != classList[docIdx]:
            errorCount += 1
    print('the error rate is: %.2f' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V
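# localWords calls calcMostFreq, which is not defined anywhere in this section.
# A minimal sketch of what it presumably does, assuming it returns the 30
# highest-frequency (word, count) pairs from fullText; this is a hypothetical
# reconstruction, not necessarily the repo's actual implementation.
import operator

def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # occurrences of each vocabulary word
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # the 30 most frequent (word, count) pairs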
import random
import bayes
from numpy import array

def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # remove top frequent words
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # build training and testing set
    trainingSet = list(range(2 * minLen))  # list() so indices can be deleted in Python 3
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # training
    p0V, p1V, pLocal = bayes.trainNB0(array(trainMat), array(trainClasses))
    # testing
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pLocal) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
import random
import bayes
from numpy import array

def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):  # visit one entry from each RSS source per iteration
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)  # remove the highest-frequency words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
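# A hedged usage sketch for localWords: parse two RSS feeds with feedparser and
# pass the resulting dicts in. The feed URLs below are placeholders invented for
# illustration, not URLs from the original code.
import feedparser

ny = feedparser.parse('https://example.com/newyork.rss')  # hypothetical feed URL
sf = feedparser.parse('https://example.com/sfbay.rss')    # hypothetical feed URL
vocabList, p0V, p1V = localWords(ny, sf)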
def test_bagOfWords2VecMN(self):
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    features = bayes.bagOfWords2VecMN(myVocabList, listOPosts[0])
    expected = [
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1
    ]
    self.assertEqual(features, expected)
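# For reference, a minimal sketch of the bagOfWords2VecMN being tested above,
# assuming the conventional bag-of-words model: unlike a set-of-words vector,
# each slot counts how many times the word occurs. This is an assumption about
# bayes.bagOfWords2VecMN, not a copy of its source.
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # increment the count, don't just set to 1
    return returnVec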
import random
import numpy as np
import bayes

def localWords(feed1, feed0):
    # feed1 and feed0 are two RSS feeds already parsed by feedparser, each a dict
    docList = []    # list of posts, each post split into words
    classList = []  # list of labels
    fullText = []   # all words from all posts in one flat list
    # 'entries' holds several posts; minLen records the smaller post count to avoid indexing past the end
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])  # take the post body and split it into words
        docList.append(wordList)   # ['12','34'].append(['56','78']) ==> [['12','34'], ['56','78']]
        fullText.extend(wordList)  # ['12','34'].extend(['56','78']) ==> ['12','34','56','78']
        classList.append(1)        # New York posts are labeled 1
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)        # San Francisco posts are labeled 0
    vocabList = bayes.createVocabList(docList)  # build the vocabulary
    # find the 30 most frequent words in fullText and remove them from vocabList
    top30Words = calcMostFreq(vocabList, fullText)
    for (word, count) in top30Words:
        if word in vocabList:
            vocabList.remove(word)
    trainingSet = list(range(2 * minLen))  # build the training and test sets
    testSet = []
    for i in range(minLen // 10):  # randomly hold out a slice of the data as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # turn each training document into a word vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))  # train
    # measure the classifier's accuracy on the held-out test data
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
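# Every variant above leans on bayes.trainNB0 and bayes.classifyNB. A minimal
# sketch of the standard naive Bayes training and classification they presumably
# implement, with Laplace smoothing and log probabilities to avoid underflow;
# this is an assumption about the bayes module, not its verbatim source.
import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pClass1 = sum(trainCategory) / float(numTrainDocs)    # P(class = 1)
    p0Num = np.ones(numWords)   # Laplace smoothing: start all word counts at 1
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # log conditional probabilities per word, per class
    return np.log(p0Num / p0Denom), np.log(p1Num / p1Denom), pClass1

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # sum of log P(word|class) over the words present, plus log P(class)
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0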
import random
import bayes
from numpy import array

def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        emailText = open('email/spam/%d.txt' % i).read()  # renamed to avoid shadowing the built-in str
        print("str")
        print(emailText)
        wordList = bayes.textParse(emailText)
        print("wordlist")
        print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    print("doclist")
    print(len(docList))
    print(docList)
    print("fulllist")
    print(len(fullText))
    print(fullText)
    print(classList)
    vocabList = bayes.createVocabList(docList)  # create vocabulary
    trainingSet = list(range(50))  # create test set
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
import random
import bayes
from numpy import array

def spamTest():
    """
    Parse the 25 emails in each of the spam and ham folders into word lists,
    then build a training set and a test set: of the 50 emails, randomly pick
    10 as the test set and train on the remaining 40 (hold-out cross-validation).
    :return: vocabList, p0V, p1V
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 emails
        # random.uniform(x, y) returns a random float in the range [x, y]
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # walk every document in the training set
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # build the word vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))  # compute the probabilities needed for classification
    errorCount = 0
    for docIndex in testSet:  # walk the test set
        # use the same bag-of-words encoding here as in training
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
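# Both spamTest variants rely on bayes.textParse to tokenize raw email text.
# A minimal sketch of the usual implementation: split on non-word characters,
# lowercase everything, and drop very short tokens. An assumption about the
# bayes module, not the repo's actual code.
import re

def textParse(bigString):
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]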
'''
@author: laiwei
'''
import bayes

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print(myVocabList)
print(listOPosts)
print(listClasses)
print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))
print(bayes.bagOfWords2VecMN(myVocabList, listOPosts[0]))
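# The script above prints both encodings for the same post. Assuming the usual
# definitions, setOfWords2Vec marks presence/absence while bagOfWords2VecMN
# counts occurrences. A sketch of the set-of-words variant for contrast,
# again an assumption about the bayes module rather than its verbatim source:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # presence flag only, not a count
    return returnVec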