def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # extend, not append, so fullText stays a flat list of words
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50); testSet = []
    # hold out 10 randomly chosen documents as the test set
    for i in range(10):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del(trainingSet[randIdx])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
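# Note: the spamTest variants in this section assume a textParse helper that is not
# shown here. A minimal sketch, assuming the usual Machine Learning in Action
# convention (split on non-word characters, lower-case, drop tokens of length <= 2):
import re

def textParse(bigString):
    # split the raw email text on anything that is not a word character
    listOfTokens = re.split(r'\W+', bigString)
    # keep lower-cased tokens longer than two characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]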
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))  # a range object does not support del, so make it a list
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNBO(trainingMat, trainingClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is: ", float(errorCount) / len(testSet))
def testingNB():
    # 1. load the data set
    listOPosts, listClasses = bayes.loadDataSet()
    print('listOPosts: ', listOPosts, '\n************************************\nlistClasses: ', listClasses)
    # 2. build the vocabulary (set of unique words)
    myVocabList = bayes.createVocabList(listOPosts)
    # 3. build the data matrix of word-occurrence vectors
    trainMat = []
    for postinDoc in listOPosts:
        # yields an m x len(myVocabList) matrix whose entries are all 0/1
        # print('postinDoc:', postinDoc)
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # 4. train on the data
    p0V, p1V, pAb = bayes.trainNB0(np.array(trainMat), np.array(listClasses))
    # 5. test on new posts
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # load and parse the text files
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    # randomly build the training set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # classify the test set
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # a range object does not support del, so convert it to a list
    trainingSet = list(range(50)); testSet = []
    # randomly pick 10 documents as the test set
    for i in range(10):  # loop 10 times
        # draw a random integer between 0 and 49 inclusive
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove that index from the training set
    # the remaining 40 documents form the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # 10 indices were removed above, so 40 remain
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # convert the document into a word vector
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def test_createVocablist(self):
    data_set, _ = bayes.loadDataSet()
    vocab_list = bayes.createVocabList(data_set)
    print("\n vocab_list == %s" % (vocab_list))
    # output the vector corresponding to row 0 of the data set
    # (i.e., the positions in the data_set vocabulary of the words in row 0 are set to 1)
    vec = bayes.setOfWords2Vec(vocab_list, data_set[0])
    print("\n vec == %s" % (vec))
    vec = bayes.setOfWords2Vec(vocab_list, data_set[3])
    print("\n vec == %s" % (vec))
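# For reference, a minimal sketch of what the createVocabList / setOfWords2Vec
# functions exercised above are assumed to do (union of all words seen, then a
# 0/1 presence vector over that vocabulary), following the usual Machine
# Learning in Action definitions:
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of all words seen so far
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # mark the word as present
    return returnVec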
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    tesDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(tesDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    tesDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(tesDoc, p0V, p1V, pAb))
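# The trainNB0 / classifyNB calls in these snippets assume the standard Machine
# Learning in Action formulation: smoothed per-class word frequencies, log
# probabilities to avoid floating-point underflow, and a compare-the-two-
# posteriors decision rule. A sketch under that assumption:
from numpy import ones, log

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)  # Laplace-style smoothing
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # return log-probability vectors and the class-1 prior
    return log(p0Num / p0Denom), log(p1Num / p1Denom), pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0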
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['quit', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def spamTest():
    docList = []; classList = []; fullText = []
    # read the 25 files under spam (and the 25 under ham)
    for i in range(1, 26):
        # spam class
        wordList = textParse(open('./data/spam/%d.txt' % i).read())
        # append to docList
        docList.append(wordList)
        # extend adds the individual elements of the word list
        fullText.extend(wordList)
        classList.append(1)
        # ham (normal mail) class
        wordList = textParse(open('./data/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary from the document list: all txt files have been read and tokenized
    vocabList = bayes.createVocabList(docList)
    docMatrix = np.zeros((50, len(vocabList)))
    i = 0
    for document in docList:
        # convert each document into a vector
        docVec = bayes.setOfWords2Vec(vocabList, document)
        # store it as a row of the document matrix
        docMatrix[i, :] = docVec
        i += 1
    print('vocabulary size:', len(vocabList))
    print('vocabulary:', vocabList)
    print('document:', docList[0], '\nword count', len(docList[0]))
    print('document matrix row:', docMatrix[0], '\nword count', sum(docMatrix[0]))
    # train the Bayes classifier
    p0Vec, p1Vec, pAbusive = bayes.trainNBC(docMatrix, classList)
    # print(p1Vec)
    # print(p0Vec)
    # print(pAbusive)
    # test on the (training) documents themselves
    i = 0
    errorCount = 0
    for document in docList:
        testVec = bayes.setOfWords2Vec(vocabList, document)
        testClass = bayes.classifyNBC(np.array(testVec), p0Vec, p1Vec, pAbusive)
        if testClass != classList[i]:
            errorCount += 1
        i += 1
    print('error percent: ', errorCount / float(len(docList)))
def testingNB():
    postList, classList = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(postList)
    trainMat = []
    for post in postList:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, post))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, classList)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def main():
    postingList, classVec = bayes.loadDataSet()
    vlist = bayes.create_vacabulary_list(postingList)
    tranmat = []
    for row in postingList:
        tranmat.append(bayes.setOfWords2Vec(vlist, row))
    print bayes.trainNB0(tranmat, classVec)
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def testNB():
    listOPosts, listClasses = bayes.loadDataSet()  # load the data set
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    resultLabel = {0: 'Not garbage', 1: 'Garbage'}
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
def test_train():
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pab = bayes.trainNB0(trainMat, listClasses)
    print p1v
def test_setOfWords2Vec(self):
    # listOPosts is actually...
    # listClasses is actually a list of labels for the data in listOPosts
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    features = bayes.setOfWords2Vec(myVocabList, listOPosts[0])
    expected = [
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1
    ]
    self.assertEqual(features, expected)
def spamTest():
    docList = []
    classList = []
    fullText = []
    # parse text from the spam emails
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # parse text from the ham emails
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # build training set and test set
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # select the test set and remove it from the full data set
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    # classify and measure the error rate
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        try:
            wordList = textParse(open('data/Ch04/email/spam/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            wordList = textParse(open('data/Ch04/email/ham/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        except Exception as e:
            print(str(e))
            traceback.print_exc()
            print(i)
            exit()
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print(docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
def test_train_nb(self):
    data_set, listClasses = bayes.loadDataSet()
    vocab_list = bayes.createVocabList(data_set)
    print("\n vocab_list == %s" % (vocab_list))
    trainMat = []
    for postinDoc in data_set:
        trainMat.append(bayes.setOfWords2Vec(vocab_list, postinDoc))
    p0Vect, p1Vect, pAbusive = bayes.trainNB0(trainMat, listClasses)
    print("\n p0Vect == %s\n p1Vect == %s\n pAbusive == %s\n" % (p0Vect, p1Vect, pAbusive))
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    # randomly split the data set into two sets: a test set and a training set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))  # random int in 0..len-1
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # split
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "error word: %s" % (docList[docIndex])
    print "error rate is: %f" % (float(errorCount) / len(testSet))
def spamTest():
    docList = []
    classList = []
    fullText = []
    # parse text from email
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # parse text
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # build training set and test set
    trainingSet = range(50); testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # select the test set and remove it from the full data set
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    # classify and test precision
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
def testSimpTrain():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    print "trainMat:", trainMat
    print "listClasses:", listClasses
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    print "pAb:", pAb
    print "p0V:", p0V
    print "p1V:", p1V
def spamTest():
    docList = []
    classList = []
    fullText = []
    # load the text files under the spam and ham folders and parse them into word lists
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)  # this example has 50 emails in total, indexed 0 to 49
    testSet = []
    # each document whose index is drawn is added to the test set
    # and removed from the training set at the same time
    for i in range(10):  # 10 emails are chosen at random as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMatrix = []; trainClasses = []
    for docIndex in trainingSet:
        trainMatrix.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB(array(trainMatrix), array(trainClasses))
    errorCount = 0
    # if an email is misclassified, increment the error count; finally report the overall error rate
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is :', float(errorCount) / len(testSet)
def spamTest():
    docList = []
    classList = []
    # fullText = []  # not actually used
    for i in range(1, 26):  # in this example the sample files are named 1.txt to 25.txt
        wordList = textParse(open('email/spam/%d.txt' % i).read())  # parse the email into individual words
        docList.append(wordList)  # store the sample contents in docList
        # fullText.extend(wordList)
        classList.append(1)  # samples under spam get class label 1
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        # fullText.extend(wordList)
        classList.append(0)  # samples under ham get class label 0
    vocabList = bayes.createVocabList(docList)  # build the full vocabulary from docList
    trainingSet = list(range(50))  # 50 samples in total, matching the length of classList
    testSet = []  # holds the test sample indices
    for i in list(range(10)):
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a random sample as a test sample
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove the test sample from the training samples
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # iterate over the training samples
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))  # word-occurrence vector for this sample
        trainClasses.append(classList[docIndex])  # class label of this sample
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))  # train the algorithm to obtain the probabilities
    errorCount = 0
    for docIndex in testSet:  # iterate over the test samples
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        resultFlag = bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)  # classify with the classifier function
        if resultFlag != classList[docIndex]:  # if the result is wrong, increment the error count
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # lists holding the index values of the training set and the test set
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        # from the 50 emails, randomly pick 40 for training and 10 for testing;
        # random.uniform samples from a uniform distribution over [low, high),
        # i.e. closed on the left and open on the right
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNBO(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('wrong testSet:', docList[docIndex])
    print('wrong rate:%.2f%%' % (float(errorCount) / len(testSet) * 100))
def test_trainNBO(self):
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []  # list of lists, e.g., [[...], ..., [...]]
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # This is interesting, as the names sent to the function imply different
    # types than the names received by the function. Compare sending
    # trainCategory to receiving listClasses. There isn't even a hint of shared
    # meaning between those two names from the program's (self-referential)
    # perspective.
    # p0Vect, p1Vect, pAbusive = trainNBO(trainMatrix, trainCategory)
    p0V, p1V, pAb = bayes.trainNBO(trainMat, listClasses)
    # print p0V, p1V, pAb
    self.assertAlmostEqual(pAb, 0.5)
def spamTest(): """ 将文件夹spam和ham中分别的25篇右键导入解析为词列表,再构建一个测试集与训练集, 50篇中再随机选10篇作为测试集,其余20篇作为测试集(留存交叉验证) :return: """ docList = [] classList = [] fullText = [] for i in range(1, 26): wordList = bayes.textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = bayes.textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = list(range(50)) testSet = [] for i in range(10): #随机选出10篇 randIndex = int(random.uniform( 0, len(trainingSet))) #random.uniform(x, y) 方法将随机生成一个实数,它在 [x,y] 范围内。 testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: #遍历训练集中所有的文档 trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex])) #构建词向量 trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) #计算分类所需的概率 errorCount = 0 for docIndex in testSet: #遍历测试集 wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print('the error rate is :', float(errorCount) / len(testSet)) return vocabList, p0V, p1V
def spamTest(): """ 对贝叶斯垃圾邮件分类器进行自动化处理 :return: """ docList = [] classList = [] fullText = [] for i in range(1, 26): # 1. 导入并解析文本文件 wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.append(wordList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.append(wordList) classList.append(0) vocabList = bayes.createVocabList(docList) trainingSet = range(50) testSet = [] for i in range(10): # 2. 随机构建训练集 randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: # train the classifier (get probs) trainNB0 trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 # 3. 对测试集分类 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet)
#coding:utf-8
import bayes

# document set and label set
listOPosts, listClasses = bayes.loadDataSet()
# list containing every unique word
myList = bayes.createVocabList(listOPosts)
# print myList
trainMat = []  # document matrix
for i in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myList, i))
p0Vect, p1Vect, pAbusive = bayes.trainNB0(trainMat, listClasses)
print p0Vect
print p1Vect
print pAbusive  # probability that a document is abusive, over all documents
# print len(trainMat)
# myVec = bayes.setOfWords2Vec(myList, listOPosts[0])  # vector for document 0
# print myVec
# vo = [1, 2, 3]
# print vo.index(2)
# print vo.index(3)
# print vo.index(1)
    return Vec, out


if __name__ == '__main__':
    googDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\goods.txt'
    badDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\bad.txt'
    # 1 = positive review, 0 = negative review
    goodVec, goodList = DataHandle(googDataPath, 1)
    badVec, badList = DataHandle(badDataPath, 0)
    listClasses = goodVec + badVec
    listOPosts = goodList + badList
    print(listClasses)
    print(listOPosts)
    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    # 3. build the word-occurrence data matrix
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # 4. train on the data
    p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses))
    # 5. classify user input
    while True:
        inputS = input('Please enter your review of this product: ')
        testEntry = wordCut(inputS)
        thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
        print('verdict: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
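# The review-classification script above relies on a wordCut helper that is not
# shown. Assuming it performs Chinese word segmentation, one hypothetical
# implementation built on the jieba library could be:
import jieba

def wordCut(sentence):
    # segment the sentence and drop whitespace-only tokens
    return [word for word in jieba.cut(sentence) if word.strip()]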
import bayes

if __name__ == '__main__':
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    print myVocabList
    print bayes.setOfWords2Vec(myVocabList, listOposts[0])
    print bayes.setOfWords2Vec(myVocabList, listOposts[3])
# coding: utf-8
import bayes

listPosts, listClasses = bayes.loadDataSet()
vocabList = bayes.createVocabList(listPosts)
# print(vocabList)
# print(bayes.setOfWords2Vec(vocabList, listPosts[0]))
trainMat = []
for doc in listPosts:
    trainMat.append(bayes.setOfWords2Vec(vocabList, doc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print(pAb)
print(p0V)
def simptestTest():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print myVocabList
    print listOPosts[0]
    print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
import bayes

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print(myVocabList)
print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))
print(bayes.setOfWords2Vec(myVocabList, listOPosts[3]))
""" import bayes from imp import reload reload(bayes) # 调用生产测试数据的函数 listOposts, listClasses = bayes.loadDataSet() # 获取无重复、全部词的函数 myVocaBList = bayes.creatVocabList(listOposts) myVocaBList # 获得每个文本的词向量 bayes.setOfWords2Vec(myVocaBList, listOposts[0]) bayes.setOfWords2Vec(myVocaBList, listOposts[3]) bayes.setOfWords2Vec(myVocaBList, listOposts[4]) # 调用计算函数,计算各类先验概率等 import numpy as np trainMat = [] for postinDoc in listOposts: trainMat.append(bayes.setOfWords2Vec(myVocaBList, postinDoc)) p0v, p1v, pAb = bayes.trainNB0(trainMat, listClasses) pAb p0v p1v p2v = 1 - p1v
# -*- coding:utf-8 -*-
from numpy import *
import bayes
import feedparser

listOPosts, listClasses = bayes.loadDataSet()
# print listOPosts
# print listClasses
myVocabList = bayes.createVocabList(listOPosts)
# print myVocabList
# print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# print bayes.setOfWords2Vec(myVocabList, listOPosts[3])
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
# print trainMat
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
# print 'p0V:\n', p0V
# print 'p1V:\n', p1V
# print 'pAb:\n', pAb
testEntry = ['love', 'my', 'dalmation']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
# print testEntry, 'classify as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
testEntry = ['stupid', 'garbage']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
# print testEntry, 'classify as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
# bayes.spamTest()
# bayes.spamTest()
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
import bayes

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print(myVocabList)
vecList1 = bayes.setOfWords2Vec(myVocabList, listOPosts[0])
vecList4 = bayes.setOfWords2Vec(myVocabList, listOPosts[3])
print(vecList1)
print(vecList4)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
print(trainMat)
print(len(trainMat))
print(listClasses)
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
# print(pAb)
print(p0V)
# print(p1V)
import bayes

listOPosts, listClasses = bayes.loadDataSet()
print listOPosts
print listClasses
myVocabList = bayes.createVocabList(listOPosts)
print myVocabList
print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
import bayes
import feedparser

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print myVocabList
print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
print bayes.setOfWords2Vec(myVocabList, listOPosts[3])
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print pAb
print p0V
print p1V
bayes.testingNB()
print '==email classify=='
bayes.spamTest()
print '==email classify=='
bayes.spamTest()
print '==email classify=='
bayes.spamTest()
print '==feedparser classify=='
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(ny, sf)
vocabList, pSF, pNY = bayes.localWords(ny, sf)
def testSetOfWords2Vec(self):
    trainSet, classVec = BayesTestCase.loadDataSet()
    vocabList = bayes.createVocabList(trainSet)
    vec = bayes.setOfWords2Vec(vocabList, trainSet[0])
    theVec = [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1]
    self.assertEqual(theVec, vec)
import bayes
from numpy import *

listOposts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOposts)
trainMat = []
print("-----start for about trainMat----- ")
for postinDoc in listOposts:
    print("postinDoc = ", postinDoc)
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    print("trainMat = ", trainMat)
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print("p0V = ", p0V)
print("p1V = ", p1V)
print("pAb = ", pAb)
# -*- coding: utf-8 -*-
# train on the sample data
from numpy import *
import bayes

postList, classVec = bayes.loadDataSet()
vocabList = bayes.createVocabList(postList)
mat = []
for i in postList:
    mat.append(bayes.setOfWords2Vec(vocabList, i))
p0, p1, pAbusive = bayes.trainNB0(mat, classVec)

# test using RSS feed documents
import bayes
import feedparser

sci_env = feedparser.parse('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
edu = feedparser.parse('http://feeds.bbci.co.uk/news/education/rss.xml')
rate = 0.0
for i in range(10):
    vocabList, p0, p1, erate = bayes.localWords(sci_env, edu)
    rate += erate
print "error rate: %f" % (rate / 10)
# len(ny['entries'])

# get the most frequently occurring words for each class of documents
import bayes
import feedparser

sci_env = feedparser.parse('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
__author__ = 'Dian'

import bayes
from numpy import *

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
# print("myVocabList:\n")
# print(myVocabList)
# print '\n'
# print "the result of the first Email:\n"
# print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# print '\n'
# print "the result of the fourth Email:\n"
# print bayes.setOfWords2Vec(myVocabList, listOPosts[3])
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print "pAb = \n"
print pAb
print "p0V = \n"
print p0V
print "p1V = \n"
print p1V
bayes.testingNB()
import bayes

listOPosts, listClasses = bayes.loadDataSet()
print(listOPosts)
wordVec = bayes.createVocabList(listOPosts)
print(wordVec)
tt = bayes.setOfWords2Vec(wordVec, listOPosts[0])
print(tt)