Example #1
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50); testSet = []
    for i in range(10):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del(trainingSet[randIdx])

    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
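
All of the spamTest and testingNB snippets on this page call into a bayes module (textParse, createVocabList, setOfWords2Vec, trainNB0, classifyNB, ...) whose source is not shown here. The sketches added between examples are minimal reconstructions following the usual Machine Learning in Action, Chapter 4 conventions; the exact implementations in each repository may differ. First, the text-parsing and vocabulary helpers:

import re

def textParse(bigString):
    # split on non-alphanumeric characters; keep lowercased tokens longer than two characters
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def createVocabList(dataSet):
    # union of all words across all documents
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # 0/1 vector marking which vocabulary words occur in the document
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec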
Example #2
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))  # use a list so that del works under Python 3
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNBO(trainingMat, trainingClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print ("the error rate is: ", float(errorCount)/len(testSet))
Example #3
def testingNB():
    # 1. Load the data set
    listOPosts, listClasses = bayes.loadDataSet()
    print('listOPosts: ', listOPosts,
          '\n************************************\nlistClasses: ', listClasses)

    # 2. Build the vocabulary list
    myVocabList = bayes.createVocabList(listOPosts)

    # 3. Mark which words appear and build the training matrix
    trainMat = []
    for postinDoc in listOPosts:
        # yields an m * len(myVocabList) matrix of 0/1 occurrence flags
        # print('postinDoc:', postinDoc)
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))

    # 4. Train on the data
    p0V, p1V, pAb = bayes.trainNB0(np.array(trainMat), np.array(listClasses))

    # 5. Test the classifier
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))

    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
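
classifyNB, used in the test step above, compares the two classes in log space: it sums the log conditional probabilities of the words present in the input vector and adds the log prior. A minimal sketch consistent with the trainNB0 sketch earlier (vec2Classify must be a NumPy array so the element-wise product works, which is why most snippets wrap it in array(...)):

from numpy import log

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # log-posterior of each class, up to a shared constant
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0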
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Load and parse the text files
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    # Randomly build the training set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # Classify the test set
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # range objects do not support del, so convert to a list
    trainingSet = list(range(50)); testSet = []
    # Randomly pick 10 samples as the test set
    for i in range(10):  # loop 10 times
        # pick a random integer between 0 and 49 (inclusive)
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove this index from the list
    # Use the remaining 40 samples as the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # 10 indices were removed above, so 40 remain
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # convert to a word vector
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
Example #6
    def test_createVocablist(self):
        data_set, _ = bayes.loadDataSet()
        vocab_list = bayes.createVocabList(data_set)
        print("\n vocab_list == %s" % (vocab_list))

        # Output the word vector for row 0 of the data set
        # (i.e., the positions of that row's words are set to 1 against the full data_set vocabulary)
        vec = bayes.setOfWords2Vec(vocab_list, data_set[0])
        print("\n vec == %s" % (vec))
        vec = bayes.setOfWords2Vec(vocab_list, data_set[3])
        print("\n vec == %s" % (vec))
Example #7
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    tesDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(tesDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    tesDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(tesDoc, p0V, p1V, pAb))
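
The testingNB examples rely on loadDataSet, the book's toy corpus: six short posts labelled 1 for abusive and 0 for not abusive. A minimal sketch of the conventional version, included only so the snippets above are self-contained:

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = not abusive
    return postingList, classVec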
Example #8
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = bayes.trainNB0(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['quit', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def spamTest():
    docList = []; classList = []; fullText = []
    # Read the 25 files under the spam folder (and the matching ham files)
    for i in range(1, 26):
        # spam class
        wordList = textParse(open('./data/spam/%d.txt' % i).read())
        # append to docList
        docList.append(wordList)
        # extend adds the individual elements of the list
        fullText.extend(wordList)
        classList.append(1)

        # ham (normal email) class
        wordList = textParse(open('./data/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    # Build the vocabulary list from the document list (all txt files parsed into tokens)
    vocabList = bayes.createVocabList(docList)

    docMatrix = np.zeros((50, len(vocabList)))
    i = 0
    for document in docList:
        # convert each document into a vector
        docVec = bayes.setOfWords2Vec(vocabList, document)
        # store the document vector in the matrix
        docMatrix[i, :] = docVec
        i += 1
    print('vocabulary size:', len(vocabList))
    print('vocabulary:', vocabList)
    print('document contents:', docList[0], '\nword count', len(docList[0]))
    print('document matrix:', docMatrix[0], '\nword count', sum(docMatrix[0]))

    # Train the Bayes classifier
    p0Vec, p1Vec, pAbusive = bayes.trainNBC(docMatrix, classList)
    # print(p1Vec)
    # print(p0Vec)
    # print(pAbusive)
    # Test the classifier
    i = 0
    errorCount = 0
    for document in docList:
        testVec = bayes.setOfWords2Vec(vocabList, document)
        testClass = bayes.classifyNBC(np.array(testVec), p0Vec, p1Vec, pAbusive)
        if testClass != classList[i]:
            errorCount += 1
        i += 1
    print('error percent: ', errorCount/float(len(docList)))
def testingNB():
    postList, classList = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(postList)
    trainMat = []
    for post in postList:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, post))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, classList)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
Example #11
def main():
    postingList, classVec = bayes.loadDataSet()
    vlist = bayes.create_vacabulary_list(postingList)
    tranmat = []
    for row in postingList:
        tranmat.append(bayes.setOfWords2Vec(vlist, row))
    print bayes.trainNB0(tranmat, classVec)
Example #12
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    trainMat = []
    for postinDoc in listOPosts:

        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
Example #13
def testNB():
    listOPosts, listClasses = bayes.loadDataSet()  # load the data
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)

    resultLabel = {0: 'Not garbage', 1: 'Garbage'}
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:',
          resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])

    testEntry = ['stupid', 'garbage']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:',
          resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
def test_train():
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pab = bayes.trainNB0(trainMat, listClasses)

    print p1v
Example #15
 def test_setOfWords2Vec(self):
     # listOPosts is actually...
     # listClasses is actually a list of labels for the data in listOPosts
     listOPosts, listClasses = bayes.loadDataSet()
     myVocabList = bayes.createVocabList(listOPosts)
     features = bayes.setOfWords2Vec(myVocabList, listOPosts[0])
     expected = [
         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1
     ]
     self.assertEqual(features, expected)
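
Note that the hard-coded expected vector in this test (and in the similar test in Example #39) only holds if createVocabList returns the vocabulary in a fixed order. When createVocabList is built on a set, Python 3's per-run string hash randomization can change the iteration order between runs, so such assertions can fail intermittently. A sorted variant, sketched under that assumption, keeps the ordering reproducible:

def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return sorted(vocabSet)  # deterministic ordering, so position-based assertions are stable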
Example #16
def spamTest():
    docList = []
    classList = []
    fullText = []

    # parse text from email
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        #parse text
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)

    #build training set and test set
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        #select testset and remove the testset from all dataset
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))

    #classify and test precision
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
Example #17
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        try:
            wordList = textParse(
                open('data/Ch04/email/spam/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            wordList = textParse(open('data/Ch04/email/ham/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        except Exception as e:
            print(str(e))
            traceback.print_exc()
            print(i)
            exit()
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V,
                      pSpam) != classList[docIndex]:
            errorCount += 1
            print(docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
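
The try/except wrapper in this variant is handy because, under Python 3, open(...).read() fails with a UnicodeDecodeError if any of the sample email files contains bytes that are invalid in the default codec (a known quirk of this data set). A common workaround, shown here as a hypothetical helper rather than part of the original code, is to read the files with a tolerant encoding:

def readEmail(path):
    # latin-1 maps every byte to a character, so decoding never fails;
    # an alternative is open(path, encoding='utf-8', errors='ignore')
    with open(path, encoding='latin-1') as fh:
        return fh.read()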
Example #18
    def test_train_nb(self):
        data_set, listClasses = bayes.loadDataSet()
        vocab_list = bayes.createVocabList(data_set)
        print("\n vocab_list == %s" % (vocab_list))

        trainMat = []
        for postinDoc in data_set:
            trainMat.append(bayes.setOfWords2Vec(vocab_list, postinDoc))

        p0Vect, p1Vect, pAbusive = bayes.trainNB0(trainMat, listClasses)
        print("\n p0Vect == %s\n p1Vect == %s\n pAbusive == %s\n" %
              (p0Vect, p1Vect, pAbusive))
def spamTest():
	docList 	= []
	classList 	= []
	fullText 	= []
	for i in range(1, 26):
		wordList = textParse(open('email/spam/%d.txt' % i).read())
		docList.append(wordList)
		fullText.extend(wordList)
		classList.append(1)

		wordList = textParse(open('email/ham/%d.txt' % i).read())
		docList.append(wordList)
		fullText.extend(wordList)
		classList.append(0)

	vocabList 	= bayes.createVocabList(docList)
	trainingSet = range(50)
	testSet 	= []

	# randomly split data set into 2 sets: test set, and training set
	for i in range(10):
		randIndex = int(random.uniform(0, len(trainingSet)))	# random int 0~len
		testSet.append(trainingSet[randIndex])
		del(trainingSet[randIndex])		# split

	trainMat = [];
	trainClasses = [];
	for docIndex in trainingSet:
		trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
		trainClasses.append(classList[docIndex])

	p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
	errorCount = 0

	for docIndex in testSet:
		wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
		if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
			errorCount += 1
			print "error word: %s" % (docList[docIndex])
	print "error rate is: %f", float(errorCount) / len(testSet)
def spamTest():
    docList = []
    classList = []
    fullText = []

    # parse text from email
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        #parse text
        wordList = textParse(open('email/ham/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)

    #build training set and test set
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        #select testset and remove the testset from all dataset
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))

    #classify and test precision
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
Example #21
def testSimpTrain():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))

    print "trainMat:", trainMat
    print "listClasses:", listClasses
    p0V,p1V,pAb = bayes.trainNB0(trainMat, listClasses)
    print "pAb:",pAb
    print "p0V:",p0V
    print "p1V:",p1V
def spamTest():
    docList = []
    classList =[]
    fullText = []

    # Load the text files under the spam and ham folders and parse them into word lists
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)  # 50 emails in total in this example, with index values 0 to 49

    testSet = []
    # The documents at the chosen indices are added to the test set and removed from the training set.
    for i in range(10):  # 10 emails are randomly chosen as the test set
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMatrix = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMatrix.append(bayes.setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = bayes.trainNB(array(trainMatrix),array(trainClasses))
    
    errorCount=0
    for docIndex in testSet:
        # If an email is misclassified, add 1 to the error count; finally report the overall error rate
        wordVector = bayes.setOfWords2Vec(vocabList,docList[docIndex])
        if bayes.classifyNB(array(wordVector),p0V,p1V,pSpam) !=classList[docIndex]:
            errorCount +=1
    print 'the error rate is:', float(errorCount) / len(testSet)
Example #23
def spamTest():
    docList = []
    classList = []
    # fullText = []  # not used in this example

    for i in range(1, 26):  # in this example the sample files are named 1.txt to 25.txt
        wordList = textParse(open('email/spam/%d.txt' % i).read())  # parse the email into individual tokens
        docList.append(wordList)  # store the sample content in docList
        #fullText.extend(wordList)
        classList.append(1)  # samples under spam get class label 1
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        #fullText.extend(wordList)
        classList.append(0)  # samples under ham get class label 0
    vocabList = bayes.createVocabList(docList)  # build the full vocabulary list from docList
    trainingSet = list(range(50))  # 50 samples in total, matching the length of classList
    
    testSet = []  # holds the test sample indices
    for i in list(range(10)):
        randIndex = int(random.uniform(0, len(trainingSet)))  # randomly pick a sample for the test set
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove the test sample from the training set
    trainMat = []
    trainClasses = []

    for docIndex in trainingSet:  # iterate over the training samples
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))  # word-occurrence vector for this sample
        trainClasses.append(classList[docIndex])  # class label of the current sample
    p0V,p1V,pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))  # train the algorithm to obtain the probabilities
    errorCount = 0

    for docIndex in testSet:  # iterate over the test samples
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        resultFlag = bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)  # classify with the classification function
        if resultFlag != classList[docIndex]:  # if the result is wrong, add 1 to the error count
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
Example #24
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []  # lists of index values for the training set and the test set
    for i in range(10):  # from the 50 emails, randomly pick 40 for training and 10 for testing
        # sample from the uniform distribution [low, high); closed on the left, open on the right
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNBO(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print('misclassified test doc:', docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))
Example #25
 def test_trainNBO(self):
     listOPosts, listClasses = bayes.loadDataSet()
     myVocabList = bayes.createVocabList(listOPosts)
     trainMat = [] # list of lists, e.g., [[...], ..., [...]]
     for postinDoc in listOPosts:
         trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
     # This is interesting, as the names sent to the function imply
     # different types than the names received by the function.
     # Compare sending listClasses to receiving trainCategory.
     # There isn't even a hint of shared meaning between those two names
     # from the program's (self-referential) perspective.
     # p0Vect, p1Vect, pAbusive = trainNBO(trainMatrix, trainCategory)
     p0V, p1V, pAb = bayes.trainNBO(trainMat, listClasses)
     # print p0V, p1V, pAb
     self.assertAlmostEqual(pAb, 0.5)
Example #26
def spamTest():
    """
    Import the 25 emails from each of the spam and ham folders and parse them into word lists,
    then build a training set and a test set: of the 50 emails, 10 are chosen at random as the
    test set and the remaining 40 are used for training (hold-out cross-validation).
    :return: 
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 emails
        # random.uniform(x, y) returns a random float in the range [x, y]
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # iterate over all documents in the training set
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # build the word vector
        trainClasses.append(classList[docIndex])
    # compute the probabilities needed for classification
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # iterate over the test set
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
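
This variant trains on bagOfWords2VecMN (word counts) but converts the test documents with setOfWords2Vec (0/1 flags). Both produce vectors of the same length, so trainNB0 and classifyNB accept either, though mixing the two representations is usually unintentional. For reference, a minimal sketch of the bag-of-words variant in the book's convention:

def bagOfWords2VecMN(vocabList, inputSet):
    # like setOfWords2Vec, but counts every occurrence instead of just flagging presence
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec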
Example #27
def spamTest():
    """
    Automated test of the naive Bayes spam classifier
    :return:
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # 1. Load and parse the text files
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        # 2. Randomly build the training set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # 3. Classify the test set
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
Example #28
#coding:utf-8

import bayes

# document set and label set
listOPosts, listClasses = bayes.loadDataSet()

# list containing every distinct word
myList = bayes.createVocabList(listOPosts)
#print myList

trainMat = []  # document matrix
for i in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myList, i))
p0Vect, p1Vect, pAbusive = bayes.trainNB0(trainMat, listClasses)
print p0Vect
print p1Vect
print pAbusive  # probability that a document is abusive (abusive docs / total docs)
#print len(trainMat)

#myVec = bayes.setOfWords2Vec(myList,listOPosts[0]) # vector for document 0
#print myVec

# vo = [1,2,3]
# print vo.index(2)
# print vo.index(3)
# print vo.index(1)
Example #29
    return Vec, out

if __name__ == '__main__':
    googDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\goods.txt'
    badDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\bad.txt'

    # 1 = positive review, 0 = negative review
    goodVec, goodList = DataHandle(googDataPath, 1)
    badVec, badList = DataHandle(badDataPath, 0)

    listClasses = goodVec + badVec
    listOPosts = goodList + badList
    print(listClasses)
    print(listOPosts)

    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    # 3. Mark which words appear and build the data matrix
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # 4. Train on the data
    p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses))
    # 5. Test with user input
    while True:
        inputS = input(u'Please enter your review of this product: ')

        testEntry = wordCut(inputS)
        thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
        print('Classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
Example #30
import bayes

if __name__ == '__main__':
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    print myVocabList
    print bayes.setOfWords2Vec(myVocabList, listOposts[0])
    print bayes.setOfWords2Vec(myVocabList, listOposts[3])
Example #31
# coding: utf-8

import bayes

listPosts, listClasses = bayes.loadDataSet()
vocabList = bayes.createVocabList(listPosts)
# print(vocabList)
# print(bayes.setOfWords2Vec(vocabList, listPosts[0]))
trainMat = []
for doc in listPosts:
    trainMat.append(bayes.setOfWords2Vec(vocabList, doc))

p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print(pAb)
print(p0V)
Example #32
def simptestTest():
	listOPosts, listClasses = bayes.loadDataSet()
	myVocabList = bayes.createVocabList(listOPosts)
	print myVocabList
	print listOPosts[0]
	print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
import bayes

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print(myVocabList)
print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))
print(bayes.setOfWords2Vec(myVocabList, listOPosts[3]))
Example #34
"""

import bayes
from imp import reload

reload(bayes)

# Call the function that generates the test data
listOposts, listClasses = bayes.loadDataSet()

# Get the list of all words, with duplicates removed
myVocaBList = bayes.creatVocabList(listOposts)
myVocaBList

# Get the word vector for each document
bayes.setOfWords2Vec(myVocaBList, listOposts[0])
bayes.setOfWords2Vec(myVocaBList, listOposts[3])
bayes.setOfWords2Vec(myVocaBList, listOposts[4])

# Call the training function to compute the class priors and per-word probabilities
import numpy as np
trainMat = []
for postinDoc in listOposts:
    trainMat.append(bayes.setOfWords2Vec(myVocaBList, postinDoc))

p0v, p1v, pAb = bayes.trainNB0(trainMat, listClasses)
pAb
p0v
p1v
p2v = 1 - p1v
# -*- coding:utf-8 -*-
from numpy import *
import bayes
import feedparser

listOPosts, listClasses = bayes.loadDataSet()
# print listOPosts
# print listClasses
myVocabList = bayes.createVocabList(listOPosts)
# print myVocabList
# print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# print bayes.setOfWords2Vec(myVocabList, listOPosts[3])
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
# print trainMat
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
# print 'p0V:\n', p0V
# print 'p1V:\n', p1V
# print 'pAb:\n', pAb
testEntry = ['love', 'my', 'dalmation']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
# print testEntry, 'classify as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
testEntry = ['stupid', 'garbage']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
# print testEntry, 'classify as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
# bayes.spamTest()
# bayes.spamTest()

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
Example #36
import bayes

listOPosts,listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)

print(myVocabList)

vecList1 = bayes.setOfWords2Vec(myVocabList,listOPosts[0])
vecList4 = bayes.setOfWords2Vec(myVocabList,listOPosts[3])
print(vecList1)
print(vecList4)

trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))

print(trainMat)
print(len(trainMat))
print(listClasses)

p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses)
# print(pAb)
print(p0V)
# print(p1V)
Example #37
import bayes

listOPosts, listClasses = bayes.loadDataSet()

print listOPosts
print listClasses

myVocabList = bayes.createVocabList(listOPosts)

print myVocabList
print bayes.setOfWords2Vec(myVocabList,listOPosts[0])
import bayes
import feedparser

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print myVocabList
print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
print bayes.setOfWords2Vec(myVocabList, listOPosts[3])

trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print pAb
print p0V
print p1V

bayes.testingNB()

print '==email classify=='
bayes.spamTest()
print '==email classify=='
bayes.spamTest()
print '==email classify=='
bayes.spamTest()

print '==feedparser classify=='
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(ny, sf)
vocabList, pSF, pNY = bayes.localWords(ny, sf)
Example #39
	def testSetOfWords2Vec(self):
		trainSet, classVec = BayesTestCase.loadDataSet()
		vocabList = bayes.createVocabList(trainSet)
		vec = bayes.setOfWords2Vec(vocabList, trainSet[0])
		theVec = [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1]
		self.assertEqual(theVec, vec)
import bayes
from numpy import *
listOposts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOposts)
trainMat = []
print("-----start for about trainMat----- ")
for postinDoc in listOposts:
    print("postinDoc = ", postinDoc)
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    print("trainMat = ", trainMat)

p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print("p0V = ", p0V)
print("p1V = ", p1V)
print("pAb = ", pAb)
# -*- coding: utf-8 -*-

# Training data
from numpy import *
import bayes
postList, classVec = bayes.loadDataSet()
vocabList = bayes.createVocabList(postList)
mat = []
for i in postList:
    mat.append(bayes.setOfWords2Vec(vocabList, i))
p0, p1, pAbusive = bayes.trainNB0(mat, classVec)

# Test with RSS feed documents
import bayes
import feedparser
sci_env = feedparser.parse(
    'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
edu = feedparser.parse('http://feeds.bbci.co.uk/news/education/rss.xml')
rate = 0.0
for i in range(10):
    vocabList, p0, p1, erate = bayes.localWords(sci_env, edu)
    rate += erate

print "error rate: %f" % (rate / 10)
# len(ny['entries'])

# Get the most frequent words from each class of documents
import bayes
import feedparser
sci_env = feedparser.parse(
    'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
__author__ = 'Dian'
import bayes
from numpy import *
listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
# print("myVocabList:\n")
# print(myVocabList)
# print '\n'
# print "the result of the first Email:\n"
# print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# print '\n'
# print "the result of the fourth Email:\n"
# print bayes.setOfWords2Vec(myVocabList, listOPosts[3])
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
poV, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print "pAb = \n"
print pAb
print "p0V = \n"
print poV
print "p1V = \n"
print p1V
bayes.testingNB()
Example #43
import bayes

listOPosts, listClasses = bayes.loadDataSet()

print(listOPosts)

wordVec = bayes.createVocabList(listOPosts)

print(wordVec)

tt = bayes.setOfWords2Vec(wordVec, listOPosts[0])

print(tt)