import random

import numpy as np

# NOTE: sd, bf, of, tt, and te below are helper modules defined elsewhere in
# this project (text parsing, bag-of-words utilities, naive Bayes training
# and classification); their exact module names are project-specific.


def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = sd.textParse(feed1['entries'][i]['summary'])  # parse each feed-1 entry into a token list
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = sd.textParse(feed0['entries'][i]['summary'])  # parse each feed-0 entry into a token list
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bf.createVocabList(docList)  # build the vocabulary list
    top30Words = calcMostFreq(vocabList, fullText)  # find the 30 most frequent words
    for pairW in top30Words:  # remove the 30 most frequent words from the vocabulary
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []  # build the test set
    for i in range(20):  # randomly move 20 document indices from the training set to the test set
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # build the training matrix and label vector for trainNB0
        trainMat.append(bf.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = of.trainNB0(np.array(trainMat), np.array(trainClasses))  # train the classifier (get probabilities)
    errorCount = 0
    for docIndex in testSet:  # classify the remaining documents
        wordVector = bf.bagOfWords2VecMN(vocabList, docList[docIndex])
        if tt.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("misclassified test document:", docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))
    return vocabList, p0V, p1V
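# localWords calls calcMostFreq, which is not shown in this section. A minimal
# sketch, assuming it returns the 30 highest-frequency (token, count) pairs
# sorted by count in descending order:
import operator


def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # occurrences of each vocabulary word in the corpus
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # top 30 (token, count) pairs; localWords reads pairW[0]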
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # iterate over the 25 spam and 25 ham txt files
        wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())  # read each spam email and parse it into a token list
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # label spam as 1
        wordList = textParse(open('email/ham/%d.txt' % i, 'r').read())  # read each ham email and parse it into a token list
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # label ham as 0
    vocabList = bf.createVocabList(docList)  # build the vocabulary list (no duplicates)
    trainingSet = list(range(50))
    testSet = []  # index lists for the training set and the test set
    for i in range(10):  # of the 50 emails, randomly pick 40 for training and 10 for testing
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a random index
        testSet.append(trainingSet[randIndex])  # add it to the test set
        del trainingSet[randIndex]  # and remove it from the training set
    trainMat = []
    trainClasses = []  # training matrix and training label vector
    for docIndex in trainingSet:  # iterate over the training set
        trainMat.append(bf.setOfWords2Vec(
            vocabList, docList[docIndex]))  # add the set-of-words vector to the training matrix
        trainClasses.append(classList[docIndex])  # add the label to the training label vector
    p0V, p1V, pSpam = of.trainNB0(np.array(trainMat), np.array(trainClasses))  # train the naive Bayes model
    errorCount = 0  # misclassification counter
    for docIndex in testSet:  # iterate over the test set
        wordVector = bf.setOfWords2Vec(vocabList, docList[docIndex])  # set-of-words vector for the test document
        if te.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:  # misclassified
            errorCount += 1  # increment the error count
            print("misclassified test document:", docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))
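# spamTest calls textParse without a module prefix, and it is not defined in
# this section. A minimal sketch of a typical implementation, assuming tokens
# are split on non-word characters, lowercased, and filtered to length > 2:
import re


def textParse(bigString):
    listOfTokens = re.split(r'\W+', bigString)  # split on any non-alphanumeric character
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # drop very short tokens (e.g. "a", "to")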
def testingNB():
    listOPosts, listClasses = bf.loadDataSet()  # create the experiment samples
    myVocabList = bf.createVocabList(listOPosts)  # build the vocabulary list
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bf.setOfWords2Vec(myVocabList, postinDoc))  # vectorize the samples
    p0V, p1V, pAb = tt.trainNB0(np.array(trainMat), np.array(listClasses))  # train the naive Bayes classifier
    testEntry = ['love', 'my', 'dalmation']  # test sample 1
    thisDoc = np.array(bf.setOfWords2Vec(myVocabList, testEntry))  # vectorize the test sample
    if classifyNB(thisDoc, p0V, p1V, pAb):  # classify and print the result
        print(testEntry, 'is abusive')
    else:
        print(testEntry, 'is not abusive')
    testEntry = ['stupid', 'garbage']  # test sample 2
    thisDoc = np.array(bf.setOfWords2Vec(myVocabList, testEntry))  # vectorize the test sample
    if classifyNB(thisDoc, p0V, p1V, pAb):  # classify and print the result
        print(testEntry, 'is abusive')
    else:
        print(testEntry, 'is not abusive')
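# classifyNB is used above without a definition. Because trainNB0 returns log
# probabilities, a standard implementation sums the log conditional
# probabilities of the observed words plus the log prior for each class and
# picks the larger score; a minimal sketch under that assumption:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)        # log P(w|c=1) + log P(c=1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)  # log P(w|c=0) + log P(c=0)
    return 1 if p1 > p0 else 0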
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)  # number of training documents
    numWords = len(trainMatrix[0])  # number of words in each document vector
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # probability that a document is abusive
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)  # initialize word counts to 1 (Laplace smoothing)
    p0Denom = 2.0
    p1Denom = 2.0  # initialize denominators to 2 (Laplace smoothing)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # accumulate the data needed for P(w0|1), P(w1|1), P(w2|1), ...
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:  # accumulate the data needed for P(w0|0), P(w1|0), P(w2|0), ...
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num / p1Denom)  # take logs to avoid underflow
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive  # conditional probability arrays for each class and the abusive-class prior


if __name__ == '__main__':
    postingList, classVec = bf.loadDataSet()
    myVocabList = bf.createVocabList(postingList)
    trainMat = []
    for postinDoc in postingList:
        trainMat.append(bf.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(trainMat, classVec)
    print('p0V:\n', p0V)
    print('p1V:\n', p1V)
    print('classVec:\n', classVec)
    print('pAb:\n', pAb)
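# The bf helpers used throughout (createVocabList, setOfWords2Vec,
# bagOfWords2VecMN) are defined elsewhere in the project. Minimal sketches of
# their conventional implementations, assuming a list-based vocabulary:
def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of tokens across all documents
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # set-of-words model: presence/absence flags
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # bag-of-words model: per-word counts
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec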